|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Gunnar Raetsch 00008 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "lib/common.h" 00012 #include "lib/io.h" 00013 #include "kernel/SalzbergWordStringKernel.h" 00014 #include "features/Features.h" 00015 #include "features/StringFeatures.h" 00016 #include "features/Labels.h" 00017 #include "classifier/PluginEstimate.h" 00018 00019 using namespace shogun; 00020 00021 CSalzbergWordStringKernel::CSalzbergWordStringKernel(int32_t size, CPluginEstimate* pie, CLabels* labels) 00022 : CStringKernel<uint16_t>(size), estimate(pie), mean(NULL), variance(NULL), 00023 sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL), 00024 ld_mean_lhs(NULL), ld_mean_rhs(NULL), 00025 num_params(0), num_symbols(0), sum_m2_s2(0), pos_prior(0.5), 00026 neg_prior(0.5), initialized(false) 00027 { 00028 if (labels) 00029 set_prior_probs_from_labels(labels); 00030 } 00031 00032 CSalzbergWordStringKernel::CSalzbergWordStringKernel( 00033 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, 00034 CPluginEstimate* pie, CLabels* labels) 00035 : CStringKernel<uint16_t>(10),estimate(pie), mean(NULL), variance(NULL), 00036 sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL), 00037 ld_mean_lhs(NULL), ld_mean_rhs(NULL), 00038 num_params(0), num_symbols(0), sum_m2_s2(0), pos_prior(0.5), 00039 neg_prior(0.5), initialized(false) 00040 { 00041 if (labels) 00042 set_prior_probs_from_labels(labels); 00043 00044 init(l, r); 00045 } 00046 00047 CSalzbergWordStringKernel::~CSalzbergWordStringKernel() 00048 { 00049 cleanup(); 00050 } 00051 00052 bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r) 00053 { 00054 CStringKernel<uint16_t>::init(p_l,p_r); 00055 CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) p_l; 00056 ASSERT(l); 00057 CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) p_r; 00058 ASSERT(r); 00059 00060 int32_t i; 00061 initialized=false; 00062 00063 if (sqrtdiag_lhs!=sqrtdiag_rhs) 00064 delete[] sqrtdiag_rhs; 00065 sqrtdiag_rhs=NULL; 00066 delete[] sqrtdiag_lhs; 00067 sqrtdiag_lhs=NULL; 00068 if (ld_mean_lhs!=ld_mean_rhs) 00069 delete[] ld_mean_rhs; 00070 ld_mean_rhs=NULL; 00071 delete[] ld_mean_lhs; 00072 ld_mean_lhs=NULL; 00073 00074 sqrtdiag_lhs=new float64_t[l->get_num_vectors()]; 00075 ld_mean_lhs=new float64_t[l->get_num_vectors()]; 00076 00077 for (i=0; i<l->get_num_vectors(); i++) 00078 sqrtdiag_lhs[i]=1; 00079 00080 if (l==r) 00081 { 00082 sqrtdiag_rhs=sqrtdiag_lhs; 00083 ld_mean_rhs=ld_mean_lhs; 00084 } 00085 else 00086 { 00087 sqrtdiag_rhs=new float64_t[r->get_num_vectors()]; 00088 for (i=0; i<r->get_num_vectors(); i++) 00089 sqrtdiag_rhs[i]=1; 00090 00091 ld_mean_rhs=new float64_t[r->get_num_vectors()]; 00092 } 00093 00094 float64_t* l_ld_mean_lhs=ld_mean_lhs; 00095 float64_t* l_ld_mean_rhs=ld_mean_rhs; 00096 00097 //from our knowledge first normalize variance to 1 and then norm=1 does the job 00098 if (!initialized) 00099 { 00100 int32_t num_vectors=l->get_num_vectors(); 00101 num_symbols=(int32_t) l->get_num_symbols(); 00102 int32_t llen=l->get_vector_length(0); 00103 int32_t rlen=r->get_vector_length(0); 00104 num_params=(int32_t) llen*l->get_num_symbols(); 00105 int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols(); 00106 if ((!estimate) || (!estimate->check_models())) 00107 { 00108 SG_ERROR( "no estimate available\n"); 00109 return false ; 00110 } ; 00111 if (num_params2!=estimate->get_num_params()) 00112 { 00113 SG_ERROR( "number of parameters of estimate and feature representation do not match\n"); 00114 return false ; 00115 } ; 00116 00117 delete[] variance; 00118 delete[] mean; 00119 mean=new float64_t[num_params]; 00120 ASSERT(mean); 00121 variance=new float64_t[num_params]; 00122 ASSERT(variance); 00123 00124 for (i=0; i<num_params; i++) 00125 { 00126 mean[i]=0; 00127 variance[i]=0; 00128 } 00129 00130 00131 // compute mean 00132 for (i=0; i<num_vectors; i++) 00133 { 00134 int32_t len; 00135 bool free_vec; 00136 uint16_t* vec=l->get_feature_vector(i, len, free_vec); 00137 00138 for (int32_t j=0; j<len; j++) 00139 { 00140 int32_t idx=compute_index(j, vec[j]); 00141 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ; 00142 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ; 00143 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00144 00145 mean[idx] += value/num_vectors ; 00146 } 00147 l->free_feature_vector(vec, i, free_vec); 00148 } 00149 00150 // compute variance 00151 for (i=0; i<num_vectors; i++) 00152 { 00153 int32_t len; 00154 bool free_vec; 00155 uint16_t* vec=l->get_feature_vector(i, len, free_vec); 00156 00157 for (int32_t j=0; j<len; j++) 00158 { 00159 for (int32_t k=0; k<4; k++) 00160 { 00161 int32_t idx=compute_index(j, k); 00162 if (k!=vec[j]) 00163 variance[idx]+=mean[idx]*mean[idx]/num_vectors; 00164 else 00165 { 00166 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ; 00167 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ; 00168 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00169 00170 variance[idx] += CMath::sq(value-mean[idx])/num_vectors; 00171 } 00172 } 00173 } 00174 l->free_feature_vector(vec, i, free_vec); 00175 } 00176 00177 00178 // compute sum_i m_i^2/s_i^2 00179 sum_m2_s2=0 ; 00180 for (i=0; i<num_params; i++) 00181 { 00182 if (variance[i]<1e-14) // then it is likely to be numerical inaccuracy 00183 variance[i]=1 ; 00184 00185 //fprintf(stderr, "%i: mean=%1.2e std=%1.2e\n", i, mean[i], std[i]) ; 00186 sum_m2_s2 += mean[i]*mean[i]/(variance[i]) ; 00187 } ; 00188 } 00189 00190 // compute sum of 00191 //result -= feature*mean[a_idx]/variance[a_idx] ; 00192 00193 for (i=0; i<l->get_num_vectors(); i++) 00194 { 00195 int32_t alen ; 00196 bool free_avec; 00197 uint16_t* avec=l->get_feature_vector(i, alen, free_avec); 00198 float64_t result=0 ; 00199 for (int32_t j=0; j<alen; j++) 00200 { 00201 int32_t a_idx = compute_index(j, avec[j]) ; 00202 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[j], j) ; 00203 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[j], j) ; 00204 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00205 00206 if (variance[a_idx]!=0) 00207 result-=value*mean[a_idx]/variance[a_idx]; 00208 } 00209 ld_mean_lhs[i]=result ; 00210 00211 l->free_feature_vector(avec, i, free_avec); 00212 } 00213 00214 if (ld_mean_lhs!=ld_mean_rhs) 00215 { 00216 // compute sum of 00217 //result -= feature*mean[b_idx]/variance[b_idx] ; 00218 for (i=0; i<r->get_num_vectors(); i++) 00219 { 00220 int32_t alen; 00221 bool free_avec; 00222 uint16_t* avec=r->get_feature_vector(i, alen, free_avec); 00223 float64_t result=0; 00224 00225 for (int32_t j=0; j<alen; j++) 00226 { 00227 int32_t a_idx = compute_index(j, avec[j]) ; 00228 float64_t theta_p=1/estimate->log_derivative_pos_obsolete( 00229 avec[j], j) ; 00230 float64_t theta_n=1/estimate->log_derivative_neg_obsolete( 00231 avec[j], j) ; 00232 float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n)); 00233 00234 result -= value*mean[a_idx]/variance[a_idx] ; 00235 } 00236 00237 ld_mean_rhs[i]=result; 00238 l->free_feature_vector(avec, i, free_avec); 00239 } 00240 } 00241 00242 //warning hacky 00243 // 00244 this->lhs=l; 00245 this->rhs=l; 00246 ld_mean_lhs = l_ld_mean_lhs ; 00247 ld_mean_rhs = l_ld_mean_lhs ; 00248 00249 //compute normalize to 1 values 00250 for (i=0; i<lhs->get_num_vectors(); i++) 00251 { 00252 sqrtdiag_lhs[i]=sqrt(compute(i,i)); 00253 00254 //trap divide by zero exception 00255 if (sqrtdiag_lhs[i]==0) 00256 sqrtdiag_lhs[i]=1e-16; 00257 } 00258 00259 // if lhs is different from rhs (train/test data) 00260 // compute also the normalization for rhs 00261 if (sqrtdiag_lhs!=sqrtdiag_rhs) 00262 { 00263 this->lhs=r; 00264 this->rhs=r; 00265 ld_mean_lhs = l_ld_mean_rhs ; 00266 ld_mean_rhs = l_ld_mean_rhs ; 00267 00268 //compute normalize to 1 values 00269 for (i=0; i<rhs->get_num_vectors(); i++) 00270 { 00271 sqrtdiag_rhs[i]=sqrt(compute(i,i)); 00272 00273 //trap divide by zero exception 00274 if (sqrtdiag_rhs[i]==0) 00275 sqrtdiag_rhs[i]=1e-16; 00276 } 00277 } 00278 00279 this->lhs=l; 00280 this->rhs=r; 00281 ld_mean_lhs = l_ld_mean_lhs ; 00282 ld_mean_rhs = l_ld_mean_rhs ; 00283 00284 initialized = true ; 00285 return init_normalizer(); 00286 } 00287 00288 void CSalzbergWordStringKernel::cleanup() 00289 { 00290 delete[] variance; 00291 variance=NULL; 00292 00293 delete[] mean; 00294 mean=NULL; 00295 00296 if (sqrtdiag_lhs != sqrtdiag_rhs) 00297 delete[] sqrtdiag_rhs; 00298 sqrtdiag_rhs=NULL; 00299 00300 delete[] sqrtdiag_lhs; 00301 sqrtdiag_lhs=NULL; 00302 00303 if (ld_mean_lhs!=ld_mean_rhs) 00304 delete[] ld_mean_rhs ; 00305 ld_mean_rhs=NULL; 00306 00307 delete[] ld_mean_lhs ; 00308 ld_mean_lhs=NULL; 00309 00310 CKernel::cleanup(); 00311 } 00312 00313 float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b) 00314 { 00315 int32_t alen, blen; 00316 bool free_avec, free_bvec; 00317 uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00318 uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00319 // can only deal with strings of same length 00320 ASSERT(alen==blen); 00321 00322 float64_t result = sum_m2_s2 ; // does not contain 0-th element 00323 00324 for (int32_t i=0; i<alen; i++) 00325 { 00326 if (avec[i]==bvec[i]) 00327 { 00328 int32_t a_idx = compute_index(i, avec[i]) ; 00329 00330 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[i], i) ; 00331 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[i], i) ; 00332 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00333 00334 result += value*value/variance[a_idx] ; 00335 } 00336 } 00337 result += ld_mean_lhs[idx_a] + ld_mean_rhs[idx_b] ; 00338 00339 ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00340 ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00341 00342 if (initialized) 00343 result /= (sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]) ; 00344 00345 return result; 00346 } 00347 00348 void CSalzbergWordStringKernel::set_prior_probs_from_labels(CLabels* labels) 00349 { 00350 ASSERT(labels); 00351 00352 int32_t num_pos=0, num_neg=0; 00353 for (int32_t i=0; i<labels->get_num_labels(); i++) 00354 { 00355 if (labels->get_int_label(i)==1) 00356 num_pos++; 00357 if (labels->get_int_label(i)==-1) 00358 num_neg++; 00359 } 00360 00361 SG_INFO("priors: pos=%1.3f (%i) neg=%1.3f (%i)\n", 00362 (float64_t) num_pos/(num_pos+num_neg), num_pos, 00363 (float64_t) num_neg/(num_pos+num_neg), num_neg); 00364 00365 set_prior_probs( 00366 (float64_t)num_pos/(num_pos+num_neg), 00367 (float64_t)num_neg/(num_pos+num_neg)); 00368 }