|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 00011 #include "features/HashedWDFeaturesTransposed.h" 00012 #include "lib/io.h" 00013 #include "lib/Signal.h" 00014 #include "base/Parallel.h" 00015 00016 #ifndef WIN32 00017 #include <pthread.h> 00018 #endif 00019 00020 using namespace shogun; 00021 00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00023 struct HASHEDWD_THREAD_PARAM 00024 { 00025 CHashedWDFeaturesTransposed* hf; 00026 int32_t* sub_index; 00027 float64_t* output; 00028 int32_t start; 00029 int32_t stop; 00030 float64_t* alphas; 00031 float64_t* vec; 00032 float64_t bias; 00033 bool progress; 00034 uint32_t* index; 00035 }; 00036 #endif // DOXYGEN_SHOULD_SKIP_THIS 00037 00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str, 00039 int32_t start_order, int32_t order, int32_t from_order, 00040 int32_t hash_bits) : CDotFeatures() 00041 { 00042 ASSERT(start_order>=0); 00043 ASSERT(start_order<order); 00044 ASSERT(order<=from_order); 00045 ASSERT(hash_bits>0); 00046 ASSERT(str); 00047 ASSERT(str->have_same_length()); 00048 SG_REF(str); 00049 00050 strings=str; 00051 int32_t transposed_num_feat=0; 00052 int32_t transposed_num_vec=0; 00053 transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec); 00054 00055 string_length=str->get_max_vector_length(); 00056 num_strings=str->get_num_vectors(); 00057 ASSERT(transposed_num_feat==num_strings); 00058 ASSERT(transposed_num_vec==string_length); 00059 00060 CAlphabet* alpha=str->get_alphabet(); 00061 alphabet_size=alpha->get_num_symbols(); 00062 SG_UNREF(alpha); 00063 00064 degree=order; 00065 start_degree=start_order; 00066 if (start_degree!=0) 00067 SG_NOTIMPLEMENTED; 00068 from_degree=from_order; 00069 m_hash_bits=hash_bits; 00070 set_wd_weights(); 00071 set_normalization_const(); 00072 } 00073 00074 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig) 00075 : CDotFeatures(orig), strings(orig.strings), transposed_strings(transposed_strings), 00076 degree(orig.degree), start_degree(orig.start_degree), 00077 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits), 00078 normalization_const(orig.normalization_const) 00079 { 00080 SG_REF(strings); 00081 string_length=strings->get_max_vector_length(); 00082 num_strings=strings->get_num_vectors(); 00083 CAlphabet* alpha=strings->get_alphabet(); 00084 alphabet_size=alpha->get_num_symbols(); 00085 SG_UNREF(alpha); 00086 00087 set_wd_weights(); 00088 } 00089 00090 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed() 00091 { 00092 for (int32_t i=0; i<string_length; i++) 00093 delete[] transposed_strings[i].string; 00094 delete[] transposed_strings; 00095 00096 SG_UNREF(strings); 00097 delete[] wd_weights; 00098 } 00099 00100 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, int32_t vec_idx2) 00101 { 00102 int32_t len1, len2; 00103 bool free_vec1, free_vec2; 00104 00105 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00106 uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2); 00107 00108 ASSERT(len1==len2); 00109 00110 float64_t sum=0.0; 00111 00112 for (int32_t i=0; i<len1; i++) 00113 { 00114 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00115 { 00116 if (vec1[i+j]!=vec2[i+j]) 00117 break; 00118 if (j>=start_degree) 00119 sum += wd_weights[j]*wd_weights[j]; 00120 } 00121 } 00122 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00123 strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00124 return sum/CMath::sq(normalization_const); 00125 } 00126 00127 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00128 { 00129 if (vec2_len != w_dim) 00130 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00131 00132 float64_t sum=0; 00133 int32_t len; 00134 bool free_vec1; 00135 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00136 uint32_t* val=new uint32_t[len]; 00137 00138 uint32_t offs=0; 00139 00140 CMath::fill_vector(val, len, 0xDEADBEAF); 00141 00142 for (int32_t i=0; i < len; i++) 00143 { 00144 uint32_t o=offs; 00145 for (int32_t k=0; k<degree && i+k<len; k++) 00146 { 00147 const float64_t wd = wd_weights[k]; 00148 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00149 val[i]=h; 00150 #ifdef DEBUG_HASHEDWD 00151 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00152 #endif 00153 sum+=vec2[o+(h & mask)]*wd; 00154 o+=partial_w_dim; 00155 } 00156 offs+=partial_w_dim*degree; 00157 } 00158 delete[] val; 00159 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00160 00161 return sum/normalization_const; 00162 } 00163 00164 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00165 { 00166 ASSERT(output); 00167 // write access is internally between output[start..stop] so the following 00168 // line is necessary to write to output[0...(stop-start-1)] 00169 output-=start; 00170 ASSERT(start>=0); 00171 ASSERT(start<stop); 00172 ASSERT(stop<=get_num_vectors()); 00173 uint32_t* index=new uint32_t[stop]; 00174 00175 int32_t num_vectors=stop-start; 00176 ASSERT(num_vectors>0); 00177 00178 int32_t num_threads=parallel->get_num_threads(); 00179 ASSERT(num_threads>0); 00180 00181 CSignal::clear_cancel(); 00182 00183 if (dim != w_dim) 00184 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim); 00185 00186 #ifndef WIN32 00187 if (num_threads < 2) 00188 { 00189 #endif 00190 HASHEDWD_THREAD_PARAM params; 00191 params.hf=this; 00192 params.sub_index=NULL; 00193 params.output=output; 00194 params.start=start; 00195 params.stop=stop; 00196 params.alphas=alphas; 00197 params.vec=vec; 00198 params.bias=b; 00199 params.progress=false; //true; 00200 params.index=index; 00201 dense_dot_range_helper((void*) ¶ms); 00202 #ifndef WIN32 00203 } 00204 else 00205 { 00206 pthread_t* threads = new pthread_t[num_threads-1]; 00207 HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads]; 00208 int32_t step= num_vectors/num_threads; 00209 00210 int32_t t; 00211 00212 for (t=0; t<num_threads-1; t++) 00213 { 00214 params[t].hf = this; 00215 params[t].sub_index=NULL; 00216 params[t].output = output; 00217 params[t].start = start+t*step; 00218 params[t].stop = start+(t+1)*step; 00219 params[t].alphas=alphas; 00220 params[t].vec=vec; 00221 params[t].bias=b; 00222 params[t].progress = false; 00223 params[t].index=index; 00224 pthread_create(&threads[t], NULL, 00225 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]); 00226 } 00227 00228 params[t].hf = this; 00229 params[t].sub_index=NULL; 00230 params[t].output = output; 00231 params[t].start = start+t*step; 00232 params[t].stop = stop; 00233 params[t].alphas=alphas; 00234 params[t].vec=vec; 00235 params[t].bias=b; 00236 params[t].progress = false; //true; 00237 params[t].index=index; 00238 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]); 00239 00240 for (t=0; t<num_threads-1; t++) 00241 pthread_join(threads[t], NULL); 00242 00243 delete[] params; 00244 delete[] threads; 00245 delete[] index; 00246 } 00247 #endif 00248 00249 #ifndef WIN32 00250 if ( CSignal::cancel_computations() ) 00251 SG_INFO( "prematurely stopped. \n"); 00252 #endif 00253 } 00254 00255 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00256 { 00257 ASSERT(sub_index); 00258 ASSERT(output); 00259 00260 uint32_t* index=new uint32_t[num]; 00261 00262 int32_t num_threads=parallel->get_num_threads(); 00263 ASSERT(num_threads>0); 00264 00265 CSignal::clear_cancel(); 00266 00267 if (dim != w_dim) 00268 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim); 00269 00270 #ifndef WIN32 00271 if (num_threads < 2) 00272 { 00273 #endif 00274 HASHEDWD_THREAD_PARAM params; 00275 params.hf=this; 00276 params.sub_index=sub_index; 00277 params.output=output; 00278 params.start=0; 00279 params.stop=num; 00280 params.alphas=alphas; 00281 params.vec=vec; 00282 params.bias=b; 00283 params.progress=false; //true; 00284 params.index=index; 00285 dense_dot_range_helper((void*) ¶ms); 00286 #ifndef WIN32 00287 } 00288 else 00289 { 00290 pthread_t* threads = new pthread_t[num_threads-1]; 00291 HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads]; 00292 int32_t step= num/num_threads; 00293 00294 int32_t t; 00295 00296 for (t=0; t<num_threads-1; t++) 00297 { 00298 params[t].hf = this; 00299 params[t].sub_index=sub_index; 00300 params[t].output = output; 00301 params[t].start = t*step; 00302 params[t].stop = (t+1)*step; 00303 params[t].alphas=alphas; 00304 params[t].vec=vec; 00305 params[t].bias=b; 00306 params[t].progress = false; 00307 params[t].index=index; 00308 pthread_create(&threads[t], NULL, 00309 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]); 00310 } 00311 00312 params[t].hf = this; 00313 params[t].sub_index=sub_index; 00314 params[t].output = output; 00315 params[t].start = t*step; 00316 params[t].stop = num; 00317 params[t].alphas=alphas; 00318 params[t].vec=vec; 00319 params[t].bias=b; 00320 params[t].progress = false; //true; 00321 params[t].index=index; 00322 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]); 00323 00324 for (t=0; t<num_threads-1; t++) 00325 pthread_join(threads[t], NULL); 00326 00327 delete[] params; 00328 delete[] threads; 00329 delete[] index; 00330 } 00331 #endif 00332 00333 #ifndef WIN32 00334 if ( CSignal::cancel_computations() ) 00335 SG_INFO( "prematurely stopped. \n"); 00336 #endif 00337 } 00338 00339 void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p) 00340 { 00341 HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p; 00342 CHashedWDFeaturesTransposed* hf=par->hf; 00343 int32_t* sub_index=par->sub_index; 00344 float64_t* output=par->output; 00345 int32_t start=par->start; 00346 int32_t stop=par->stop; 00347 float64_t* alphas=par->alphas; 00348 float64_t* vec=par->vec; 00349 float64_t bias=par->bias; 00350 bool progress=par->progress; 00351 uint32_t* index=par->index; 00352 int32_t string_length=hf->string_length; 00353 int32_t degree=hf->degree; 00354 float64_t* wd_weights=hf->wd_weights; 00355 T_STRING<uint8_t>* transposed_strings=hf->transposed_strings; 00356 uint32_t mask=hf->mask; 00357 int32_t partial_w_dim=hf->partial_w_dim; 00358 float64_t normalization_const=hf->normalization_const; 00359 00360 if (sub_index) 00361 { 00362 for (int32_t j=start; j<stop; j++) 00363 output[j]=0.0; 00364 00365 uint32_t offs=0; 00366 for (int32_t i=0; i<string_length; i++) 00367 { 00368 uint32_t o=offs; 00369 for (int32_t k=0; k<degree && i+k<string_length; k++) 00370 { 00371 const float64_t wd = wd_weights[k]; 00372 uint8_t* dim=transposed_strings[i+k].string; 00373 uint32_t h; 00374 00375 for (int32_t j=start; j<stop; j++) 00376 { 00377 uint8_t bval=dim[sub_index[j]]; 00378 if (k==0) 00379 h=CHash::IncrementalMurmurHash2(bval, 0xDEADBEAF); 00380 else 00381 h=CHash::IncrementalMurmurHash2(bval, index[j]); 00382 index[j]=h; 00383 output[j]+=vec[o + (h & mask)]*wd; 00384 } 00385 o+=partial_w_dim; 00386 } 00387 offs+=partial_w_dim*degree; 00388 00389 if (progress) 00390 hf->io->progress(i, 0,string_length, i); 00391 } 00392 00393 for (int32_t j=start; j<stop; j++) 00394 { 00395 if (alphas) 00396 output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias; 00397 else 00398 output[j]=output[j]/normalization_const+bias; 00399 } 00400 } 00401 else 00402 { 00403 CMath::fill_vector(&output[start], stop-start, 0.0); 00404 00405 uint32_t offs=0; 00406 for (int32_t i=0; i<string_length; i++) 00407 { 00408 uint32_t o=offs; 00409 for (int32_t k=0; k<degree && i+k<string_length; k++) 00410 { 00411 const float64_t wd = wd_weights[k]; 00412 uint8_t* dim=transposed_strings[i+k].string; 00413 uint32_t h; 00414 00415 for (int32_t j=start; j<stop; j++) 00416 { 00417 if (k==0) 00418 h=CHash::IncrementalMurmurHash2(dim[j], 0xDEADBEAF); 00419 else 00420 h=CHash::IncrementalMurmurHash2(dim[j], index[j]); 00421 index[j]=h; 00422 output[j]+=vec[o + (h & mask)]*wd; 00423 } 00424 o+=partial_w_dim; 00425 } 00426 offs+=partial_w_dim*degree; 00427 00428 if (progress) 00429 hf->io->progress(i, 0,string_length, i); 00430 } 00431 00432 for (int32_t j=start; j<stop; j++) 00433 { 00434 if (alphas) 00435 output[j]=output[j]*alphas[j]/normalization_const+bias; 00436 else 00437 output[j]=output[j]/normalization_const+bias; 00438 } 00439 } 00440 00441 return NULL; 00442 } 00443 00444 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00445 { 00446 if (vec2_len != w_dim) 00447 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00448 00449 int32_t len; 00450 bool free_vec1; 00451 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00452 uint32_t* val=new uint32_t[len]; 00453 00454 uint32_t offs=0; 00455 float64_t factor=alpha/normalization_const; 00456 if (abs_val) 00457 factor=CMath::abs(factor); 00458 00459 CMath::fill_vector(val, len, 0xDEADBEAF); 00460 00461 for (int32_t i=0; i<len; i++) 00462 { 00463 uint32_t o=offs; 00464 for (int32_t k=0; k<degree && i+k<len; k++) 00465 { 00466 float64_t wd = wd_weights[k]*factor; 00467 00468 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00469 val[i]=h; 00470 00471 #ifdef DEBUG_HASHEDWD 00472 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h); 00473 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00474 #endif 00475 vec2[o+(h & mask)]+=wd; 00476 o+=partial_w_dim; 00477 } 00478 offs+=partial_w_dim*degree; 00479 } 00480 00481 delete[] val; 00482 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00483 } 00484 00485 void CHashedWDFeaturesTransposed::set_wd_weights() 00486 { 00487 ASSERT(degree>0); 00488 00489 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00490 partial_w_dim=1<<m_hash_bits; 00491 w_dim=partial_w_dim*string_length*(degree-start_degree); 00492 00493 wd_weights=new float64_t[degree]; 00494 00495 for (int32_t i=0; i<degree; i++) 00496 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00497 00498 SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, " 00499 "dim=%d partial_dim=%d num=%d, len=%d\n", 00500 degree, from_degree, alphabet_size, 00501 w_dim, partial_w_dim, num_strings, string_length); 00502 } 00503 00504 00505 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n) 00506 { 00507 if (n==0) 00508 { 00509 normalization_const=0; 00510 for (int32_t i=0; i<degree; i++) 00511 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00512 00513 normalization_const=CMath::sqrt(normalization_const); 00514 } 00515 else 00516 normalization_const=n; 00517 00518 SG_DEBUG("normalization_const:%f\n", normalization_const); 00519 } 00520 00521 CFeatures* CHashedWDFeaturesTransposed::duplicate() const 00522 { 00523 return new CHashedWDFeaturesTransposed(*this); 00524 }