|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "lib/common.h" 00012 #include "kernel/WeightedCommWordStringKernel.h" 00013 #include "features/StringFeatures.h" 00014 #include "lib/io.h" 00015 00016 using namespace shogun; 00017 00018 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel( 00019 int32_t size, bool us) 00020 : CCommWordStringKernel(size, us), degree(0), weights(NULL) 00021 { 00022 init_dictionary(1<<(sizeof(uint16_t)*9)); 00023 ASSERT(us==false); 00024 } 00025 00026 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel( 00027 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us, 00028 int32_t size) 00029 : CCommWordStringKernel(size, us), degree(0), weights(NULL) 00030 { 00031 init_dictionary(1<<(sizeof(uint16_t)*9)); 00032 ASSERT(us==false); 00033 00034 init(l,r); 00035 } 00036 00037 CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel() 00038 { 00039 delete[] weights; 00040 } 00041 00042 bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r) 00043 { 00044 ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() == 00045 ((CStringFeatures<uint16_t>*) r)->get_order()); 00046 degree=((CStringFeatures<uint16_t>*) l)->get_order(); 00047 set_wd_weights(); 00048 00049 CCommWordStringKernel::init(l,r); 00050 return init_normalizer(); 00051 } 00052 00053 void CWeightedCommWordStringKernel::cleanup() 00054 { 00055 delete[] weights; 00056 weights=NULL; 00057 00058 CCommWordStringKernel::cleanup(); 00059 } 00060 00061 bool CWeightedCommWordStringKernel::set_wd_weights() 00062 { 00063 delete[] weights; 00064 weights=new float64_t[degree]; 00065 00066 int32_t i; 00067 float64_t sum=0; 00068 for (i=0; i<degree; i++) 00069 { 00070 weights[i]=degree-i; 00071 sum+=weights[i]; 00072 } 00073 for (i=0; i<degree; i++) 00074 weights[i]=CMath::sqrt(weights[i]/sum); 00075 00076 return weights!=NULL; 00077 } 00078 00079 bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d) 00080 { 00081 ASSERT(d==degree); 00082 00083 delete[] weights; 00084 weights=new float64_t[degree]; 00085 for (int32_t i=0; i<degree; i++) 00086 weights[i]=CMath::sqrt(w[i]); 00087 return true; 00088 } 00089 00090 float64_t CWeightedCommWordStringKernel::compute_helper( 00091 int32_t idx_a, int32_t idx_b, bool do_sort) 00092 { 00093 int32_t alen, blen; 00094 bool free_avec, free_bvec; 00095 00096 CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs; 00097 CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs; 00098 00099 uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec); 00100 uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec); 00101 00102 uint16_t* avec=av; 00103 uint16_t* bvec=bv; 00104 00105 if (do_sort) 00106 { 00107 if (alen>0) 00108 { 00109 avec=new uint16_t[alen]; 00110 memcpy(avec, av, sizeof(uint16_t)*alen); 00111 CMath::radix_sort(avec, alen); 00112 } 00113 else 00114 avec=NULL; 00115 00116 if (blen>0) 00117 { 00118 bvec=new uint16_t[blen]; 00119 memcpy(bvec, bv, sizeof(uint16_t)*blen); 00120 CMath::radix_sort(bvec, blen); 00121 } 00122 else 00123 bvec=NULL; 00124 } 00125 else 00126 { 00127 if ( (l->get_num_preproc() != l->get_num_preprocessed()) || 00128 (r->get_num_preproc() != r->get_num_preprocessed())) 00129 { 00130 SG_ERROR("not all preprocessors have been applied to training (%d/%d)" 00131 " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(), 00132 r->get_num_preprocessed(), r->get_num_preproc()); 00133 } 00134 } 00135 00136 float64_t result=0; 00137 uint8_t mask=0; 00138 00139 for (int32_t d=0; d<degree; d++) 00140 { 00141 mask = mask | (1 << (degree-d-1)); 00142 uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask); 00143 00144 int32_t left_idx=0; 00145 int32_t right_idx=0; 00146 float64_t weight=weights[d]*weights[d]; 00147 00148 while (left_idx < alen && right_idx < blen) 00149 { 00150 uint16_t lsym=avec[left_idx] & masked; 00151 uint16_t rsym=bvec[right_idx] & masked; 00152 00153 if (lsym == rsym) 00154 { 00155 int32_t old_left_idx=left_idx; 00156 int32_t old_right_idx=right_idx; 00157 00158 while (left_idx<alen && (avec[left_idx] & masked) ==lsym) 00159 left_idx++; 00160 00161 while (right_idx<blen && (bvec[right_idx] & masked) ==lsym) 00162 right_idx++; 00163 00164 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx); 00165 } 00166 else if (lsym<rsym) 00167 left_idx++; 00168 else 00169 right_idx++; 00170 } 00171 } 00172 00173 if (do_sort) 00174 { 00175 delete[] avec; 00176 delete[] bvec; 00177 } 00178 00179 l->free_feature_vector(av, idx_a, free_avec); 00180 r->free_feature_vector(bv, idx_b, free_bvec); 00181 00182 return result; 00183 } 00184 00185 void CWeightedCommWordStringKernel::add_to_normal( 00186 int32_t vec_idx, float64_t weight) 00187 { 00188 int32_t len=-1; 00189 bool free_vec; 00190 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs; 00191 uint16_t* vec=s->get_feature_vector(vec_idx, len, free_vec); 00192 00193 if (len>0) 00194 { 00195 for (int32_t j=0; j<len; j++) 00196 { 00197 uint8_t mask=0; 00198 int32_t offs=0; 00199 for (int32_t d=0; d<degree; d++) 00200 { 00201 mask = mask | (1 << (degree-d-1)); 00202 int32_t idx=s->get_masked_symbols(vec[j], mask); 00203 idx=s->shift_symbol(idx, degree-d-1); 00204 dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx); 00205 offs+=s->shift_offset(1,d+1); 00206 } 00207 } 00208 00209 set_is_initialized(true); 00210 } 00211 00212 s->free_feature_vector(vec, vec_idx, free_vec); 00213 } 00214 00215 void CWeightedCommWordStringKernel::merge_normal() 00216 { 00217 ASSERT(get_is_initialized()); 00218 ASSERT(use_sign==false); 00219 00220 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs; 00221 uint32_t num_symbols=(uint32_t) s->get_num_symbols(); 00222 int32_t dic_size=1<<(sizeof(uint16_t)*8); 00223 float64_t* dic=new float64_t[dic_size]; 00224 memset(dic, 0, sizeof(float64_t)*dic_size); 00225 00226 for (uint32_t sym=0; sym<num_symbols; sym++) 00227 { 00228 float64_t result=0; 00229 uint8_t mask=0; 00230 int32_t offs=0; 00231 for (int32_t d=0; d<degree; d++) 00232 { 00233 mask = mask | (1 << (degree-d-1)); 00234 int32_t idx=s->get_masked_symbols(sym, mask); 00235 idx=s->shift_symbol(idx, degree-d-1); 00236 result += dictionary_weights[offs + idx]; 00237 offs+=s->shift_offset(1,d+1); 00238 } 00239 dic[sym]=result; 00240 } 00241 00242 init_dictionary(1<<(sizeof(uint16_t)*8)); 00243 memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size); 00244 delete[] dic; 00245 } 00246 00247 float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i) 00248 { 00249 if (!get_is_initialized()) 00250 SG_ERROR( "CCommWordStringKernel optimization not initialized\n"); 00251 00252 ASSERT(use_sign==false); 00253 00254 float64_t result=0; 00255 bool free_vec; 00256 int32_t len=-1; 00257 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs; 00258 uint16_t* vec=s->get_feature_vector(i, len, free_vec); 00259 00260 if (vec && len>0) 00261 { 00262 for (int32_t j=0; j<len; j++) 00263 { 00264 uint8_t mask=0; 00265 int32_t offs=0; 00266 for (int32_t d=0; d<degree; d++) 00267 { 00268 mask = mask | (1 << (degree-d-1)); 00269 int32_t idx=s->get_masked_symbols(vec[j], mask); 00270 idx=s->shift_symbol(idx, degree-d-1); 00271 result += dictionary_weights[offs + idx]*weights[d]; 00272 offs+=s->shift_offset(1,d+1); 00273 } 00274 } 00275 00276 result=normalizer->normalize_rhs(result, i); 00277 } 00278 s->free_feature_vector(vec, i, free_vec); 00279 return result; 00280 } 00281 00282 float64_t* CWeightedCommWordStringKernel::compute_scoring( 00283 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target, 00284 int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init) 00285 { 00286 if (do_init) 00287 CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas); 00288 00289 int32_t dic_size=1<<(sizeof(uint16_t)*9); 00290 float64_t* dic=new float64_t[dic_size]; 00291 memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size); 00292 00293 merge_normal(); 00294 float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat, 00295 num_sym, target, num_suppvec, IDX, alphas, false); 00296 00297 init_dictionary(1<<(sizeof(uint16_t)*9)); 00298 memcpy(dictionary_weights,dic, sizeof(float64_t)*dic_size); 00299 delete[] dic; 00300 00301 return result; 00302 }