|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 2007-2009 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "clustering/KMeans.h" 00013 #include "distance/Distance.h" 00014 #include "features/Labels.h" 00015 #include "features/SimpleFeatures.h" 00016 #include "lib/Mathematics.h" 00017 #include "base/Parallel.h" 00018 00019 #ifndef WIN32 00020 #include <pthread.h> 00021 #endif 00022 00023 #define MUSRECALC 00024 00025 #define PAR_THRESH 10 00026 00027 using namespace shogun; 00028 00029 CKMeans::CKMeans() 00030 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL), 00031 mus(NULL), Weights(NULL) 00032 { 00033 } 00034 00035 CKMeans::CKMeans(int32_t k_, CDistance* d) 00036 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL), 00037 mus(NULL), Weights(NULL) 00038 { 00039 set_distance(d); 00040 } 00041 00042 CKMeans::~CKMeans() 00043 { 00044 delete[] R; 00045 delete[] mus; 00046 } 00047 00048 bool CKMeans::train(CFeatures* data) 00049 { 00050 ASSERT(distance); 00051 00052 if (data) 00053 distance->init(data, data); 00054 00055 ASSERT(distance->get_feature_type()==F_DREAL); 00056 00057 CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs(); 00058 ASSERT(lhs); 00059 int32_t num=lhs->get_num_vectors(); 00060 SG_UNREF(lhs); 00061 00062 Weights=new float64_t[num]; 00063 for (int32_t i=0; i<num; i++) 00064 Weights[i]=1.0; 00065 00066 clustknb(false, NULL); 00067 delete[] Weights; 00068 00069 return true; 00070 } 00071 00072 bool CKMeans::load(FILE* srcfile) 00073 { 00074 return false; 00075 } 00076 00077 bool CKMeans::save(FILE* dstfile) 00078 { 00079 return false; 00080 } 00081 00082 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00083 struct thread_data 00084 { 00085 float64_t* x; 00086 CSimpleFeatures<float64_t>* y; 00087 float64_t* z; 00088 int32_t n1, n2, m; 00089 int32_t js, je; /* defines the matrix stripe */ 00090 int32_t offs; 00091 }; 00092 #endif // DOXYGEN_SHOULD_SKIP_THIS 00093 00094 namespace shogun 00095 { 00096 void *sqdist_thread_func(void * P) 00097 { 00098 struct thread_data *TD=(struct thread_data*) P; 00099 float64_t* x=TD->x; 00100 CSimpleFeatures<float64_t>* y=TD->y; 00101 float64_t* z=TD->z; 00102 int32_t n1=TD->n1, 00103 m=TD->m, 00104 js=TD->js, 00105 je=TD->je, 00106 offs=TD->offs, 00107 j,i,k; 00108 00109 for (j=js; j<je; j++) 00110 { 00111 int32_t vlen=0; 00112 bool vfree=false; 00113 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree); 00114 00115 for (i=0; i<n1; i++) 00116 { 00117 float64_t sum=0; 00118 for (k=0; k<m; k++) 00119 sum = sum + CMath::sq(x[i*m + k] - vec[k]); 00120 z[j*n1 + i] = sum; 00121 } 00122 00123 y->free_feature_vector(vec, j, vfree); 00124 } 00125 return NULL; 00126 } 00127 } 00128 00129 void CKMeans::sqdist( 00130 float64_t* x, CSimpleFeatures<float64_t>* y, float64_t* z, int32_t n1, int32_t offs, 00131 int32_t n2, int32_t m) 00132 { 00133 const int32_t num_threads=parallel->get_num_threads(); 00134 int32_t nc, n2_nc = n2/num_threads; 00135 thread_data* TD = new thread_data[num_threads]; 00136 pthread_t* tid = new pthread_t[num_threads]; 00137 void *status; 00138 00139 /* prepare the structure */ 00140 TD[0].x=x ; TD[0].y=y ; TD[0].z=z ; 00141 TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m; 00142 TD[0].offs=offs; 00143 00144 if (n2>PAR_THRESH) 00145 { 00146 ASSERT(PAR_THRESH>1); 00147 00148 /* create the threads */ 00149 for (nc=0; nc<num_threads; nc++) 00150 { 00151 TD[nc]=TD[0]; 00152 TD[nc].js=nc*n2_nc; 00153 if (nc+1==num_threads) 00154 TD[nc].je=n2; 00155 else 00156 TD[nc].je=(nc+1)*n2_nc; 00157 00158 pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]); 00159 } 00160 00161 /* wait for finishing all threads */ 00162 for (nc=0; nc<num_threads; nc++) 00163 pthread_join(tid[nc], &status); 00164 00165 } 00166 else 00167 { 00168 /* simply call the ,,thread''-function */ 00169 TD[0].js=0 ; TD[0].je=n2; 00170 sqdist_thread_func((void *)&TD[0]); 00171 } 00172 00173 delete[] tid; 00174 delete[] TD; 00175 } 00176 00177 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start) 00178 { 00179 ASSERT(distance && distance->get_feature_type()==F_DREAL); 00180 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs(); 00181 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0); 00182 00183 int32_t XSize=lhs->get_num_vectors(); 00184 dimensions=lhs->get_num_features(); 00185 int32_t i, changed=1; 00186 const int32_t XDimk=dimensions*k; 00187 int32_t iter=0; 00188 00189 delete[] R; 00190 R=new float64_t[k]; 00191 00192 delete[] mus; 00193 mus=new float64_t[XDimk]; 00194 00195 int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t)); 00196 float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t)); 00197 float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t)); 00199 CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0); 00200 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus); 00201 00202 int32_t vlen=0; 00203 bool vfree=false; 00204 float64_t* vec=NULL; 00205 00206 /* ClList=zeros(XSize,1) ; */ 00207 for (i=0; i<XSize; i++) ClList[i]=0; 00208 /* weights_set=zeros(k,1) ; */ 00209 for (i=0; i<k; i++) weights_set[i]=0; 00210 00211 /* mus=zeros(dimensions, k) ; */ 00212 for (i=0; i<XDimk; i++) mus[i]=0; 00213 00214 if (!use_old_mus) 00215 { 00216 /* random clustering (select random subsets) */ 00217 /* ks=ceil(rand(1,XSize)*k); 00218 * for i=1:k, 00219 * actks= (ks==i); 00220 * c=sum(actks); 00221 * weights_set(i)=c; 00222 * 00223 * ClList(actks)=i*ones(1, c); 00224 * 00225 * if ~mus_recalc, 00226 * if c>1 00227 * mus(:,i) = sum(XData(:,actks)')'/c; 00228 * elseif c>0 00229 * mus(:,i) = XData(:,actks); 00230 * end; 00231 * end; 00232 * end ; */ 00233 00234 for (i=0; i<XSize; i++) 00235 { 00236 const int32_t Cl=CMath::random(0, k-1); 00237 int32_t j; 00238 float64_t weight=Weights[i]; 00239 00240 weights_set[Cl]+=weight; 00241 ClList[i]=Cl; 00242 00243 vec=lhs->get_feature_vector(i, vlen, vfree); 00244 00245 for (j=0; j<dimensions; j++) 00246 mus[Cl*dimensions+j] += weight*vec[j]; 00247 00248 lhs->free_feature_vector(vec, i, vfree); 00249 } 00250 for (i=0; i<k; i++) 00251 { 00252 int32_t j; 00253 00254 if (weights_set[i]!=0.0) 00255 for (j=0; j<dimensions; j++) 00256 mus[i*dimensions+j] /= weights_set[i]; 00257 } 00258 } 00259 else 00260 { 00261 ASSERT(mus_start); 00262 00263 // sqdist(mus_start, lhs, dists, k, 0, XSize, dimensions); 00265 rhs_mus->copy_feature_matrix(mus_start,dimensions,k); 00266 float64_t* p_dists=dists; 00267 00268 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k) 00269 distances_rhs(p_dists,0,k,idx); 00270 p_dists=NULL; 00271 00272 for (i=0; i<XSize; i++) 00273 { 00274 float64_t mini=dists[i*k]; 00275 int32_t Cl = 0, j; 00276 00277 for (j=1; j<k; j++) 00278 { 00279 if (dists[i*k+j]<mini) 00280 { 00281 Cl=j; 00282 mini=dists[i*k+j]; 00283 } 00284 } 00285 ClList[i]=Cl; 00286 } 00287 00288 /* Compute the sum of all points belonging to a cluster 00289 * and count the points */ 00290 for (i=0; i<XSize; i++) 00291 { 00292 const int32_t Cl = ClList[i]; 00293 float64_t weight=Weights[i]; 00294 weights_set[Cl]+=weight; 00295 #ifndef MUSRECALC 00296 vec=lhs->get_feature_vector(i, vlen, vfree); 00297 00298 for (j=0; j<dimensions; j++) 00299 mus[Cl*dimensions+j] += weight*vec[j]; 00300 00301 lhs->free_feature_vector(vec, i, vfree); 00302 #endif 00303 } 00304 #ifndef MUSRECALC 00305 /* normalization to get the mean */ 00306 for (i=0; i<k; i++) 00307 { 00308 if (weights_set[i]!=0.0) 00309 { 00310 int32_t j; 00311 for (j=0; j<dimensions; j++) 00312 mus[i*dimensions+j] /= weights_set[i]; 00313 } 00314 } 00315 #endif 00316 } 00317 00318 00319 00320 while (changed && (iter<max_iter)) 00321 { 00322 iter++; 00323 if (iter==max_iter-1) 00324 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1); 00325 00326 if (iter%1000 == 0) 00327 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed); 00328 changed=0; 00329 00330 #ifdef MUSRECALC 00331 /* mus=zeros(dimensions, k) ; */ 00332 for (i=0; i<XDimk; i++) mus[i]=0; 00333 00334 for (i=0; i<XSize; i++) 00335 { 00336 int32_t j; 00337 int32_t Cl=ClList[i]; 00338 float64_t weight=Weights[i]; 00339 00340 vec=lhs->get_feature_vector(i, vlen, vfree); 00341 00342 for (j=0; j<dimensions; j++) 00343 mus[Cl*dimensions+j] += weight*vec[j]; 00344 00345 lhs->free_feature_vector(vec, i, vfree); 00346 } 00347 for (i=0; i<k; i++) 00348 { 00349 int32_t j; 00350 00351 if (weights_set[i]!=0.0) 00352 for (j=0; j<dimensions; j++) 00353 mus[i*dimensions+j] /= weights_set[i]; 00354 } 00355 #endif 00356 00357 rhs_mus->copy_feature_matrix(mus,dimensions,k); 00358 00359 for (i=0; i<XSize; i++) 00360 { 00361 /* ks=ceil(rand(1,XSize)*XSize) ; */ 00362 const int32_t Pat= CMath::random(0, XSize-1); 00363 const int32_t ClList_Pat=ClList[Pat]; 00364 int32_t imini, j; 00365 float64_t mini, weight; 00366 00367 weight=Weights[Pat]; 00368 00369 /* compute the distance of this point to all centers */ 00370 /* dists=sqdist(mus',XData) ; */ 00372 for(int32_t idx_k=0;idx_k<k;idx_k++) 00373 dists[idx_k]=distance->distance(Pat,idx_k); 00374 00375 /* [mini,imini]=min(dists(:,i)) ; */ 00376 imini=0 ; mini=dists[0]; 00377 for (j=1; j<k; j++) 00378 if (dists[j]<mini) 00379 { 00380 mini=dists[j]; 00381 imini=j; 00382 } 00383 00384 if (imini!=ClList_Pat) 00385 { 00386 changed= changed + 1; 00387 00388 /* weights_set(imini) = weights_set(imini) + weight ; */ 00389 weights_set[imini]+= weight; 00390 /* weights_set(j) = weights_set(j) - weight ; */ 00391 weights_set[ClList_Pat]-= weight; 00392 00393 /* mu_new=mu_old + (x - mu_old)/(n+1) */ 00394 /* mus(:,imini)=mus(:,imini) + (XData(:,i) - mus(:,imini)) * (weight / weights_set(imini)) ; */ 00395 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00396 00397 for (j=0; j<dimensions; j++) 00398 mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]); 00399 00400 lhs->free_feature_vector(vec, Pat, vfree); 00401 00402 /* mu_new = mu_old - (x - mu_old)/(n-1) */ 00403 /* if weights_set(j)~=0 */ 00404 if (weights_set[ClList_Pat]!=0.0) 00405 { 00406 /* mus(:,j)=mus(:,j) - (XData(:,i) - mus(:,j)) * (weight/weights_set(j)) ; */ 00407 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00408 00409 for (j=0; j<dimensions; j++) 00410 mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]); 00411 lhs->free_feature_vector(vec, Pat, vfree); 00412 } 00413 else 00414 /* mus(:,j)=zeros(dimensions,1) ; */ 00415 for (j=0; j<dimensions; j++) 00416 mus[ClList_Pat*dimensions+j]=0; 00417 00418 /* ClList(i)= imini ; */ 00419 ClList[Pat] = imini; 00420 } 00421 } 00422 } 00423 00424 /* compute the ,,variances'' of the clusters */ 00425 for (i=0; i<k; i++) 00426 { 00427 float64_t rmin1=0; 00428 float64_t rmin2=0; 00429 00430 bool first_round=true; 00431 00432 for (int32_t j=0; j<k; j++) 00433 { 00434 if (j!=i) 00435 { 00436 int32_t l; 00437 float64_t dist = 0; 00438 00439 for (l=0; l<dimensions; l++) 00440 dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]); 00441 00442 if (first_round) 00443 { 00444 rmin1=dist; 00445 rmin2=dist; 00446 first_round=false; 00447 } 00448 else 00449 { 00450 if ((dist<rmin2) && (dist>=rmin1)) 00451 rmin2=dist; 00452 00453 if (dist<rmin1) 00454 { 00455 rmin2=rmin1; 00456 rmin1=dist; 00457 } 00458 } 00459 } 00460 } 00461 00462 R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2)); 00463 } 00464 distance->replace_rhs(rhs_cache); 00465 delete rhs_mus; 00466 00467 free(ClList); 00468 free(weights_set); 00469 free(dists); 00470 SG_UNREF(lhs); 00471 }