|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "lib/common.h" 00013 #include "lib/io.h" 00014 #include "lib/Signal.h" 00015 #include "lib/Trie.h" 00016 #include "base/Parallel.h" 00017 00018 #include "kernel/WeightedDegreeStringKernel.h" 00019 #include "kernel/FirstElementKernelNormalizer.h" 00020 #include "features/Features.h" 00021 #include "features/StringFeatures.h" 00022 00023 #ifndef WIN32 00024 #include <pthread.h> 00025 #endif 00026 00027 00028 #ifdef HAVE_BOOST_SERIALIZATION 00029 #include <boost/serialization/export.hpp> 00030 BOOST_CLASS_EXPORT(shogun::CWeightedDegreeStringKernel); 00031 #endif //HAVE_BOOST_SERIALIZATION 00032 00033 00034 using namespace shogun; 00035 00036 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00037 struct S_THREAD_PARAM 00038 { 00039 00040 int32_t* vec; 00041 float64_t* result; 00042 float64_t* weights; 00043 CWeightedDegreeStringKernel* kernel; 00044 CTrie<DNATrie>* tries; 00045 float64_t factor; 00046 int32_t j; 00047 int32_t start; 00048 int32_t end; 00049 int32_t length; 00050 int32_t* vec_idx; 00051 }; 00052 #endif // DOXYGEN_SHOULD_SKIP_THIS 00053 00054 00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ( 00056 int32_t degree_, EWDKernType type_) 00057 : CStringKernel<char>(10),weights(NULL),position_weights(NULL), 00058 weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0), 00059 max_mismatch(0), seq_length(0), block_computation(true), 00060 num_block_weights_external(0), block_weights_external(NULL), 00061 block_weights(NULL), type(type_), which_degree(-1), tries(NULL), 00062 tree_initialized(false), alphabet(NULL) 00063 { 00064 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION; 00065 lhs=NULL; 00066 rhs=NULL; 00067 00068 if (type!=E_EXTERNAL) 00069 set_wd_weights_by_type(type); 00070 00071 set_normalizer(new CFirstElementKernelNormalizer()); 00072 } 00073 00074 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ( 00075 float64_t *weights_, int32_t degree_) 00076 : CStringKernel<char>(10), weights(NULL), position_weights(NULL), 00077 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0), 00078 max_mismatch(0), seq_length(0), block_computation(true), 00079 num_block_weights_external(0), block_weights_external(NULL), 00080 block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL), 00081 tree_initialized(false), alphabet(NULL) 00082 { 00083 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION; 00084 lhs=NULL; 00085 rhs=NULL; 00086 00087 weights=new float64_t[degree*(1+max_mismatch)]; 00088 for (int32_t i=0; i<degree*(1+max_mismatch); i++) 00089 weights[i]=weights_[i]; 00090 set_normalizer(new CFirstElementKernelNormalizer()); 00091 } 00092 00093 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel( 00094 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_) 00095 : CStringKernel<char>(10), weights(NULL), position_weights(NULL), 00096 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0), 00097 max_mismatch(0), seq_length(0), block_computation(true), 00098 num_block_weights_external(0), block_weights_external(NULL), 00099 block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL), 00100 tree_initialized(false), alphabet(NULL) 00101 { 00102 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION; 00103 00104 set_wd_weights_by_type(type); 00105 set_normalizer(new CFirstElementKernelNormalizer()); 00106 init(l, r); 00107 } 00108 00109 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel() 00110 { 00111 cleanup(); 00112 00113 delete[] weights; 00114 weights=NULL; 00115 00116 delete[] block_weights; 00117 block_weights=NULL; 00118 00119 delete[] position_weights; 00120 position_weights=NULL; 00121 00122 delete[] weights_buffer; 00123 weights_buffer=NULL; 00124 } 00125 00126 00127 void CWeightedDegreeStringKernel::remove_lhs() 00128 { 00129 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n"); 00130 delete_optimization(); 00131 00132 if (tries!=NULL) 00133 tries->destroy(); 00134 00135 CKernel::remove_lhs(); 00136 } 00137 00138 void CWeightedDegreeStringKernel::create_empty_tries() 00139 { 00140 ASSERT(lhs); 00141 00142 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length(); 00143 00144 if (tries!=NULL) 00145 { 00146 tries->destroy() ; 00147 tries->create(seq_length, max_mismatch==0) ; 00148 } 00149 } 00150 00151 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r) 00152 { 00153 int32_t lhs_changed=(lhs!=l); 00154 int32_t rhs_changed=(rhs!=r); 00155 00156 CStringKernel<char>::init(l,r); 00157 00158 SG_DEBUG("lhs_changed: %i\n", lhs_changed); 00159 SG_DEBUG("rhs_changed: %i\n", rhs_changed); 00160 00161 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00162 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00163 00164 int32_t len=sf_l->get_max_vector_length(); 00165 if (lhs_changed && !sf_l->have_same_length(len)) 00166 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n"); 00167 00168 if (rhs_changed && !sf_r->have_same_length(len)) 00169 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n"); 00170 00171 SG_UNREF(alphabet); 00172 alphabet=sf_l->get_alphabet(); 00173 CAlphabet* ralphabet=sf_r->get_alphabet(); 00174 00175 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00176 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00177 00178 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()); 00179 SG_UNREF(ralphabet); 00180 00181 if (tries!=NULL) { 00182 tries->delete_trees(max_mismatch==0); 00183 SG_UNREF(tries); 00184 } 00185 tries=new CTrie<DNATrie>(degree, max_mismatch==0); 00186 create_empty_tries(); 00187 00188 init_block_weights(); 00189 00190 return init_normalizer(); 00191 } 00192 00193 void CWeightedDegreeStringKernel::cleanup() 00194 { 00195 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n"); 00196 delete_optimization(); 00197 00198 delete[] block_weights; 00199 block_weights=NULL; 00200 00201 if (tries!=NULL) 00202 { 00203 tries->destroy(); 00204 SG_UNREF(tries); 00205 tries=NULL; 00206 } 00207 00208 seq_length=0; 00209 tree_initialized = false; 00210 00211 SG_UNREF(alphabet); 00212 alphabet=NULL; 00213 00214 CKernel::cleanup(); 00215 } 00216 00217 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num) 00218 { 00219 if (tree_num<0) 00220 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n"); 00221 00222 delete_optimization(); 00223 00224 if (tree_num<0) 00225 SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ; 00226 00227 for (int32_t i=0; i<count; i++) 00228 { 00229 if (tree_num<0) 00230 { 00231 if ( (i % (count/10+1)) == 0) 00232 SG_PROGRESS(i, 0, count); 00233 00234 if (max_mismatch==0) 00235 add_example_to_tree(IDX[i], alphas[i]) ; 00236 else 00237 add_example_to_tree_mismatch(IDX[i], alphas[i]) ; 00238 00239 //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ; 00240 } 00241 else 00242 { 00243 if (max_mismatch==0) 00244 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ; 00245 else 00246 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ; 00247 } 00248 } 00249 00250 if (tree_num<0) 00251 SG_DONE(); 00252 00253 //tries.compact_nodes(NO_CHILD, 0, weights) ; 00254 00255 set_is_initialized(true) ; 00256 return true ; 00257 } 00258 00259 bool CWeightedDegreeStringKernel::delete_optimization() 00260 { 00261 if (get_is_initialized()) 00262 { 00263 if (tries!=NULL) 00264 tries->delete_trees(max_mismatch==0); 00265 set_is_initialized(false); 00266 return true; 00267 } 00268 00269 return false; 00270 } 00271 00272 00273 float64_t CWeightedDegreeStringKernel::compute_with_mismatch( 00274 char* avec, int32_t alen, char* bvec, int32_t blen) 00275 { 00276 float64_t sum = 0.0; 00277 00278 for (int32_t i=0; i<alen; i++) 00279 { 00280 float64_t sumi = 0.0; 00281 int32_t mismatches=0; 00282 00283 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00284 { 00285 if (avec[i+j]!=bvec[i+j]) 00286 { 00287 mismatches++ ; 00288 if (mismatches>max_mismatch) 00289 break ; 00290 } ; 00291 sumi += weights[j+degree*mismatches]; 00292 } 00293 if (position_weights!=NULL) 00294 sum+=position_weights[i]*sumi ; 00295 else 00296 sum+=sumi ; 00297 } 00298 return sum ; 00299 } 00300 00301 float64_t CWeightedDegreeStringKernel::compute_using_block( 00302 char* avec, int32_t alen, char* bvec, int32_t blen) 00303 { 00304 ASSERT(alen==blen); 00305 00306 float64_t sum=0; 00307 int32_t match_len=-1; 00308 00309 for (int32_t i=0; i<alen; i++) 00310 { 00311 if (avec[i]==bvec[i]) 00312 match_len++; 00313 else 00314 { 00315 if (match_len>=0) 00316 sum+=block_weights[match_len]; 00317 match_len=-1; 00318 } 00319 } 00320 00321 if (match_len>=0) 00322 sum+=block_weights[match_len]; 00323 00324 return sum; 00325 } 00326 00327 float64_t CWeightedDegreeStringKernel::compute_without_mismatch( 00328 char* avec, int32_t alen, char* bvec, int32_t blen) 00329 { 00330 float64_t sum = 0.0; 00331 00332 for (int32_t i=0; i<alen; i++) 00333 { 00334 float64_t sumi = 0.0; 00335 00336 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00337 { 00338 if (avec[i+j]!=bvec[i+j]) 00339 break ; 00340 sumi += weights[j]; 00341 } 00342 if (position_weights!=NULL) 00343 sum+=position_weights[i]*sumi ; 00344 else 00345 sum+=sumi ; 00346 } 00347 return sum ; 00348 } 00349 00350 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix( 00351 char* avec, int32_t alen, char* bvec, int32_t blen) 00352 { 00353 float64_t sum = 0.0; 00354 00355 for (int32_t i=0; i<alen; i++) 00356 { 00357 float64_t sumi=0.0; 00358 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00359 { 00360 if (avec[i+j]!=bvec[i+j]) 00361 break; 00362 sumi += weights[i*degree+j]; 00363 } 00364 if (position_weights!=NULL) 00365 sum += position_weights[i]*sumi ; 00366 else 00367 sum += sumi ; 00368 } 00369 00370 return sum ; 00371 } 00372 00373 00374 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b) 00375 { 00376 int32_t alen, blen; 00377 bool free_avec, free_bvec; 00378 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00379 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00380 float64_t result=0; 00381 00382 if (max_mismatch==0 && length==0 && block_computation) 00383 result=compute_using_block(avec, alen, bvec, blen); 00384 else 00385 { 00386 if (max_mismatch>0) 00387 result=compute_with_mismatch(avec, alen, bvec, blen); 00388 else if (length==0) 00389 result=compute_without_mismatch(avec, alen, bvec, blen); 00390 else 00391 result=compute_without_mismatch_matrix(avec, alen, bvec, blen); 00392 } 00393 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00394 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00395 00396 return result; 00397 } 00398 00399 00400 void CWeightedDegreeStringKernel::add_example_to_tree( 00401 int32_t idx, float64_t alpha) 00402 { 00403 ASSERT(alphabet); 00404 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00405 00406 int32_t len=0; 00407 bool free_vec; 00408 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00409 ASSERT(max_mismatch==0); 00410 int32_t *vec=new int32_t[len]; 00411 00412 for (int32_t i=0; i<len; i++) 00413 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00414 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00415 00416 if (length == 0 || max_mismatch > 0) 00417 { 00418 for (int32_t i=0; i<len; i++) 00419 { 00420 float64_t alpha_pw=alpha; 00421 /*if (position_weights!=NULL) 00422 alpha_pw *= position_weights[i] ;*/ 00423 if (alpha_pw==0.0) 00424 continue; 00425 ASSERT(tries); 00426 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0)); 00427 } 00428 } 00429 else 00430 { 00431 for (int32_t i=0; i<len; i++) 00432 { 00433 float64_t alpha_pw=alpha; 00434 /*if (position_weights!=NULL) 00435 alpha_pw = alpha*position_weights[i] ;*/ 00436 if (alpha_pw==0.0) 00437 continue ; 00438 ASSERT(tries); 00439 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0)); 00440 } 00441 } 00442 delete[] vec ; 00443 tree_initialized=true ; 00444 } 00445 00446 void CWeightedDegreeStringKernel::add_example_to_single_tree( 00447 int32_t idx, float64_t alpha, int32_t tree_num) 00448 { 00449 ASSERT(alphabet); 00450 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00451 00452 int32_t len; 00453 bool free_vec; 00454 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00455 ASSERT(max_mismatch==0); 00456 int32_t *vec = new int32_t[len] ; 00457 00458 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++) 00459 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00460 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00461 00462 00463 ASSERT(tries); 00464 if (alpha!=0.0) 00465 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0)); 00466 00467 delete[] vec ; 00468 tree_initialized=true ; 00469 } 00470 00471 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha) 00472 { 00473 ASSERT(tries); 00474 ASSERT(alphabet); 00475 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00476 00477 int32_t len ; 00478 bool free_vec; 00479 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00480 00481 int32_t *vec = new int32_t[len] ; 00482 00483 for (int32_t i=0; i<len; i++) 00484 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00485 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00486 00487 for (int32_t i=0; i<len; i++) 00488 { 00489 if (alpha!=0.0) 00490 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights); 00491 } 00492 00493 delete[] vec ; 00494 tree_initialized=true ; 00495 } 00496 00497 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch( 00498 int32_t idx, float64_t alpha, int32_t tree_num) 00499 { 00500 ASSERT(tries); 00501 ASSERT(alphabet); 00502 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00503 00504 int32_t len=0; 00505 bool free_vec; 00506 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00507 int32_t *vec=new int32_t[len]; 00508 00509 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++) 00510 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00511 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00512 00513 if (alpha!=0.0) 00514 { 00515 tries->add_example_to_tree_mismatch_recursion( 00516 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num, 00517 0, 0, max_mismatch, weights); 00518 } 00519 00520 delete[] vec; 00521 tree_initialized=true; 00522 } 00523 00524 00525 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx) 00526 { 00527 ASSERT(alphabet); 00528 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00529 00530 int32_t len=0; 00531 bool free_vec; 00532 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec); 00533 ASSERT(char_vec && len>0); 00534 int32_t *vec=new int32_t[len]; 00535 00536 for (int32_t i=0; i<len; i++) 00537 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00538 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00539 00540 float64_t sum=0; 00541 ASSERT(tries); 00542 for (int32_t i=0; i<len; i++) 00543 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)); 00544 00545 delete[] vec; 00546 return normalizer->normalize_rhs(sum, idx); 00547 } 00548 00549 void CWeightedDegreeStringKernel::compute_by_tree( 00550 int32_t idx, float64_t* LevelContrib) 00551 { 00552 ASSERT(alphabet); 00553 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00554 00555 int32_t len ; 00556 bool free_vec; 00557 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec); 00558 00559 int32_t *vec = new int32_t[len] ; 00560 00561 for (int32_t i=0; i<len; i++) 00562 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00563 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00564 00565 ASSERT(tries); 00566 for (int32_t i=0; i<len; i++) 00567 { 00568 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib, 00569 normalizer->normalize_rhs(1.0, idx), 00570 mkl_stepsize, weights, (length!=0)); 00571 } 00572 00573 delete[] vec ; 00574 } 00575 00576 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len) 00577 { 00578 ASSERT(tries); 00579 return tries->compute_abs_weights(len); 00580 } 00581 00582 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type) 00583 { 00584 ASSERT(degree>0); 00585 ASSERT(p_type==E_WD); 00586 00587 delete[] weights; 00588 weights=new float64_t[degree]; 00589 if (weights) 00590 { 00591 int32_t i; 00592 float64_t sum=0; 00593 for (i=0; i<degree; i++) 00594 { 00595 weights[i]=degree-i; 00596 sum+=weights[i]; 00597 } 00598 for (i=0; i<degree; i++) 00599 weights[i]/=sum; 00600 00601 for (i=0; i<degree; i++) 00602 { 00603 for (int32_t j=1; j<=max_mismatch; j++) 00604 { 00605 if (j<i+1) 00606 { 00607 int32_t nk=CMath::nchoosek(i+1, j); 00608 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j)); 00609 } 00610 else 00611 weights[i+j*degree]= 0; 00612 } 00613 } 00614 00615 if (which_degree>=0) 00616 { 00617 ASSERT(which_degree<degree); 00618 for (i=0; i<degree; i++) 00619 { 00620 if (i!=which_degree) 00621 weights[i]=0; 00622 else 00623 weights[i]=1; 00624 } 00625 } 00626 return true; 00627 } 00628 else 00629 return false; 00630 } 00631 00632 bool CWeightedDegreeStringKernel::set_weights( 00633 float64_t* ws, int32_t d, int32_t len) 00634 { 00635 if (d!=degree || len<0) 00636 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree); 00637 00638 length=len; 00639 00640 if (length==0) 00641 length=1; 00642 00643 int32_t num_weights=degree*(length+max_mismatch); 00644 delete[] weights; 00645 weights=new float64_t[num_weights]; 00646 00647 if (weights) 00648 { 00649 for (int32_t i=0; i<num_weights; i++) { 00650 if (ws[i]) // len(ws) might be != num_weights? 00651 weights[i]=ws[i]; 00652 } 00653 return true; 00654 } 00655 else 00656 return false; 00657 } 00658 00659 bool CWeightedDegreeStringKernel::set_position_weights( 00660 float64_t* pws, int32_t len) 00661 { 00662 if (len==0) 00663 { 00664 delete[] position_weights; 00665 position_weights=NULL; 00666 ASSERT(tries); 00667 tries->set_position_weights(position_weights); 00668 } 00669 00670 if (seq_length!=len) 00671 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len); 00672 00673 delete[] position_weights; 00674 position_weights=new float64_t[len]; 00675 ASSERT(tries); 00676 tries->set_position_weights(position_weights); 00677 00678 if (position_weights) 00679 { 00680 for (int32_t i=0; i<len; i++) 00681 position_weights[i]=pws[i]; 00682 return true; 00683 } 00684 else 00685 return false; 00686 } 00687 00688 bool CWeightedDegreeStringKernel::init_block_weights_from_wd() 00689 { 00690 delete[] block_weights; 00691 block_weights=new float64_t[CMath::max(seq_length,degree)]; 00692 00693 if (block_weights) 00694 { 00695 int32_t k; 00696 float64_t d=degree; // use float to evade rounding errors below 00697 00698 for (k=0; k<degree; k++) 00699 block_weights[k]= 00700 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1)); 00701 for (k=degree; k<seq_length; k++) 00702 block_weights[k]=(-d+3*k+4)/3; 00703 } 00704 00705 return (block_weights!=NULL); 00706 } 00707 00708 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external() 00709 { 00710 ASSERT(weights); 00711 delete[] block_weights; 00712 block_weights=new float64_t[CMath::max(seq_length,degree)]; 00713 00714 if (block_weights) 00715 { 00716 int32_t i=0; 00717 block_weights[0]=weights[0]; 00718 for (i=1; i<CMath::max(seq_length,degree); i++) 00719 block_weights[i]=0; 00720 00721 for (i=1; i<CMath::max(seq_length,degree); i++) 00722 { 00723 block_weights[i]=block_weights[i-1]; 00724 00725 float64_t contrib=0; 00726 for (int32_t j=0; j<CMath::min(degree,i+1); j++) 00727 contrib+=weights[j]; 00728 00729 block_weights[i]+=contrib; 00730 } 00731 } 00732 00733 return (block_weights!=NULL); 00734 } 00735 00736 bool CWeightedDegreeStringKernel::init_block_weights_const() 00737 { 00738 delete[] block_weights; 00739 block_weights=new float64_t[seq_length]; 00740 00741 if (block_weights) 00742 { 00743 for (int32_t i=1; i<seq_length+1 ; i++) 00744 block_weights[i-1]=1.0/seq_length; 00745 } 00746 00747 return (block_weights!=NULL); 00748 } 00749 00750 bool CWeightedDegreeStringKernel::init_block_weights_linear() 00751 { 00752 delete[] block_weights; 00753 block_weights=new float64_t[seq_length]; 00754 00755 if (block_weights) 00756 { 00757 for (int32_t i=1; i<seq_length+1 ; i++) 00758 block_weights[i-1]=degree*i; 00759 } 00760 00761 return (block_weights!=NULL); 00762 } 00763 00764 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly() 00765 { 00766 delete[] block_weights; 00767 block_weights=new float64_t[seq_length]; 00768 00769 if (block_weights) 00770 { 00771 for (int32_t i=1; i<degree+1 ; i++) 00772 block_weights[i-1]=((float64_t) i)*i; 00773 00774 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00775 block_weights[i-1]=i; 00776 } 00777 00778 return (block_weights!=NULL); 00779 } 00780 00781 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly() 00782 { 00783 delete[] block_weights; 00784 block_weights=new float64_t[seq_length]; 00785 00786 if (block_weights) 00787 { 00788 for (int32_t i=1; i<degree+1 ; i++) 00789 block_weights[i-1]=((float64_t) i)*i*i; 00790 00791 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00792 block_weights[i-1]=i; 00793 } 00794 00795 return (block_weights!=NULL); 00796 } 00797 00798 bool CWeightedDegreeStringKernel::init_block_weights_exp() 00799 { 00800 delete[] block_weights; 00801 block_weights=new float64_t[seq_length]; 00802 00803 if (block_weights) 00804 { 00805 for (int32_t i=1; i<degree+1 ; i++) 00806 block_weights[i-1]=exp(((float64_t) i/10.0)); 00807 00808 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00809 block_weights[i-1]=i; 00810 } 00811 00812 return (block_weights!=NULL); 00813 } 00814 00815 bool CWeightedDegreeStringKernel::init_block_weights_log() 00816 { 00817 delete[] block_weights; 00818 block_weights=new float64_t[seq_length]; 00819 00820 if (block_weights) 00821 { 00822 for (int32_t i=1; i<degree+1 ; i++) 00823 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2); 00824 00825 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00826 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2); 00827 } 00828 00829 return (block_weights!=NULL); 00830 } 00831 00832 bool CWeightedDegreeStringKernel::init_block_weights_external() 00833 { 00834 if (block_weights_external && (seq_length == num_block_weights_external) ) 00835 { 00836 delete[] block_weights; 00837 block_weights=new float64_t[seq_length]; 00838 00839 if (block_weights) 00840 { 00841 for (int32_t i=0; i<seq_length; i++) 00842 block_weights[i]=block_weights_external[i]; 00843 } 00844 } 00845 else { 00846 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external); 00847 } 00848 return (block_weights!=NULL); 00849 } 00850 00851 bool CWeightedDegreeStringKernel::init_block_weights() 00852 { 00853 switch (type) 00854 { 00855 case E_WD: 00856 return init_block_weights_from_wd(); 00857 case E_EXTERNAL: 00858 return init_block_weights_from_wd_external(); 00859 case E_BLOCK_CONST: 00860 return init_block_weights_const(); 00861 case E_BLOCK_LINEAR: 00862 return init_block_weights_linear(); 00863 case E_BLOCK_SQPOLY: 00864 return init_block_weights_sqpoly(); 00865 case E_BLOCK_CUBICPOLY: 00866 return init_block_weights_cubicpoly(); 00867 case E_BLOCK_EXP: 00868 return init_block_weights_exp(); 00869 case E_BLOCK_LOG: 00870 return init_block_weights_log(); 00871 case E_BLOCK_EXTERNAL: 00872 return init_block_weights_external(); 00873 default: 00874 return false; 00875 }; 00876 } 00877 00878 00879 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p) 00880 { 00881 S_THREAD_PARAM* params = (S_THREAD_PARAM*) p; 00882 int32_t j=params->j; 00883 CWeightedDegreeStringKernel* wd=params->kernel; 00884 CTrie<DNATrie>* tries=params->tries; 00885 float64_t* weights=params->weights; 00886 int32_t length=params->length; 00887 int32_t* vec=params->vec; 00888 float64_t* result=params->result; 00889 float64_t factor=params->factor; 00890 int32_t* vec_idx=params->vec_idx; 00891 00892 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs()); 00893 CAlphabet* alpha=wd->alphabet; 00894 00895 for (int32_t i=params->start; i<params->end; i++) 00896 { 00897 int32_t len=0; 00898 bool free_vec; 00899 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec); 00900 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++) 00901 vec[k]=alpha->remap_to_bin(char_vec[k]); 00902 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec); 00903 00904 ASSERT(tries); 00905 00906 result[i]+=factor* 00907 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]); 00908 } 00909 00910 SG_UNREF(rhs_feat); 00911 00912 return NULL; 00913 } 00914 00915 void CWeightedDegreeStringKernel::compute_batch( 00916 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec, 00917 int32_t* IDX, float64_t* alphas, float64_t factor) 00918 { 00919 ASSERT(tries); 00920 ASSERT(alphabet); 00921 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA); 00922 ASSERT(rhs); 00923 ASSERT(num_vec<=rhs->get_num_vectors()); 00924 ASSERT(num_vec>0); 00925 ASSERT(vec_idx); 00926 ASSERT(result); 00927 create_empty_tries(); 00928 00929 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length(); 00930 ASSERT(num_feat>0); 00931 int32_t num_threads=parallel->get_num_threads(); 00932 ASSERT(num_threads>0); 00933 int32_t* vec=new int32_t[num_threads*num_feat]; 00934 00935 if (num_threads < 2) 00936 { 00937 #ifdef CYGWIN 00938 for (int32_t j=0; j<num_feat; j++) 00939 #else 00940 CSignal::clear_cancel(); 00941 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++) 00942 #endif 00943 { 00944 init_optimization(num_suppvec, IDX, alphas, j); 00945 S_THREAD_PARAM params; 00946 params.vec=vec; 00947 params.result=result; 00948 params.weights=weights; 00949 params.kernel=this; 00950 params.tries=tries; 00951 params.factor=factor; 00952 params.j=j; 00953 params.start=0; 00954 params.end=num_vec; 00955 params.length=length; 00956 params.vec_idx=vec_idx; 00957 compute_batch_helper((void*) ¶ms); 00958 00959 SG_PROGRESS(j,0,num_feat); 00960 } 00961 } 00962 #ifndef WIN32 00963 else 00964 { 00965 CSignal::clear_cancel(); 00966 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++) 00967 { 00968 init_optimization(num_suppvec, IDX, alphas, j); 00969 pthread_t* threads = new pthread_t[num_threads-1]; 00970 S_THREAD_PARAM* params = new S_THREAD_PARAM[num_threads]; 00971 int32_t step= num_vec/num_threads; 00972 int32_t t; 00973 00974 for (t=0; t<num_threads-1; t++) 00975 { 00976 params[t].vec=&vec[num_feat*t]; 00977 params[t].result=result; 00978 params[t].weights=weights; 00979 params[t].kernel=this; 00980 params[t].tries=tries; 00981 params[t].factor=factor; 00982 params[t].j=j; 00983 params[t].start = t*step; 00984 params[t].end = (t+1)*step; 00985 params[t].length=length; 00986 params[t].vec_idx=vec_idx; 00987 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]); 00988 } 00989 params[t].vec=&vec[num_feat*t]; 00990 params[t].result=result; 00991 params[t].weights=weights; 00992 params[t].kernel=this; 00993 params[t].tries=tries; 00994 params[t].factor=factor; 00995 params[t].j=j; 00996 params[t].start=t*step; 00997 params[t].end=num_vec; 00998 params[t].length=length; 00999 params[t].vec_idx=vec_idx; 01000 compute_batch_helper((void*) ¶ms[t]); 01001 01002 for (t=0; t<num_threads-1; t++) 01003 pthread_join(threads[t], NULL); 01004 SG_PROGRESS(j,0,num_feat); 01005 01006 delete[] params; 01007 delete[] threads; 01008 } 01009 } 01010 #endif 01011 01012 delete[] vec; 01013 01014 //really also free memory as this can be huge on testing especially when 01015 //using the combined kernel 01016 create_empty_tries(); 01017 } 01018 01019 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max) 01020 { 01021 if (type==E_EXTERNAL && max!=0) { 01022 return false; 01023 } 01024 01025 max_mismatch=max; 01026 01027 if (lhs!=NULL && rhs!=NULL) 01028 return init(lhs, rhs); 01029 else 01030 return true; 01031 }