|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "features/ImplicitWeightedSpecFeatures.h" 00012 #include "lib/io.h" 00013 00014 using namespace shogun; 00015 00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures() 00017 { 00018 ASSERT(str); 00019 strings=str; 00020 SG_REF(strings) 00021 normalization_factors=NULL; 00022 spec_weights=NULL; 00023 num_strings = str->get_num_vectors(); 00024 alphabet_size = str->get_original_num_symbols(); 00025 degree=str->get_order(); 00026 set_wd_weights(); 00027 00028 SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size, 00029 spec_size, num_strings); 00030 00031 if (normalize) 00032 compute_normalization_const(); 00033 } 00034 00035 void CImplicitWeightedSpecFeatures::compute_normalization_const() 00036 { 00037 float64_t* factors=new float64_t[num_strings]; 00038 00039 for (int32_t i=0; i<num_strings; i++) 00040 factors[i]=1.0/CMath::sqrt(dot(i,i)); 00041 00042 normalization_factors=factors; 00043 //CMath::display_vector(normalization_factors, num_strings, "n"); 00044 } 00045 00046 bool CImplicitWeightedSpecFeatures::set_wd_weights() 00047 { 00048 delete[] spec_weights; 00049 spec_weights=new float64_t[degree]; 00050 00051 int32_t i; 00052 float64_t sum=0; 00053 spec_size=0; 00054 00055 for (i=0; i<degree; i++) 00056 { 00057 spec_size+=CMath::pow(alphabet_size, i+1); 00058 spec_weights[i]=degree-i; 00059 sum+=spec_weights[i]; 00060 } 00061 for (i=0; i<degree; i++) 00062 spec_weights[i]=CMath::sqrt(spec_weights[i]/sum); 00063 00064 return spec_weights!=NULL; 00065 } 00066 00067 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d) 00068 { 00069 ASSERT(d==degree); 00070 00071 delete[] spec_weights; 00072 spec_weights=new float64_t[degree]; 00073 for (int32_t i=0; i<degree; i++) 00074 spec_weights[i]=CMath::sqrt(w[i]); 00075 return true; 00076 } 00077 00078 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig), 00079 num_strings(orig.num_strings), 00080 alphabet_size(orig.alphabet_size), spec_size(orig.spec_size) 00081 { 00082 SG_NOTIMPLEMENTED; 00083 SG_REF(strings); 00084 } 00085 00086 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures() 00087 { 00088 SG_UNREF(strings); 00089 delete[] spec_weights; 00090 delete[] normalization_factors; 00091 } 00092 00093 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, int32_t vec_idx2) 00094 { 00095 ASSERT(vec_idx1 < num_strings); 00096 ASSERT(vec_idx2 < num_strings); 00097 00098 int32_t len1=-1; 00099 int32_t len2=-1; 00100 bool free_vec1; 00101 bool free_vec2; 00102 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00103 uint16_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2); 00104 00105 float64_t result=0; 00106 uint8_t mask=0; 00107 00108 for (int32_t d=0; d<degree; d++) 00109 { 00110 mask = mask | (1 << (degree-d-1)); 00111 uint16_t masked=strings->get_masked_symbols(0xffff, mask); 00112 00113 int32_t left_idx=0; 00114 int32_t right_idx=0; 00115 float64_t weight=spec_weights[d]*spec_weights[d]; 00116 00117 while (left_idx < len1 && right_idx < len2) 00118 { 00119 uint16_t lsym=vec1[left_idx] & masked; 00120 uint16_t rsym=vec2[right_idx] & masked; 00121 00122 if (lsym == rsym) 00123 { 00124 int32_t old_left_idx=left_idx; 00125 int32_t old_right_idx=right_idx; 00126 00127 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym) 00128 left_idx++; 00129 00130 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym) 00131 right_idx++; 00132 00133 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx); 00134 } 00135 else if (lsym<rsym) 00136 left_idx++; 00137 else 00138 right_idx++; 00139 } 00140 } 00141 00142 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00143 strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00144 00145 if (normalization_factors) 00146 return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2]; 00147 else 00148 return result; 00149 } 00150 00151 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00152 { 00153 ASSERT(vec2_len == spec_size); 00154 ASSERT(vec_idx1 < num_strings); 00155 00156 float64_t result=0; 00157 int32_t len1=-1; 00158 bool free_vec1; 00159 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00160 00161 if (vec1 && len1>0) 00162 { 00163 for (int32_t j=0; j<len1; j++) 00164 { 00165 uint8_t mask=0; 00166 int32_t offs=0; 00167 uint16_t v=*vec1++; 00168 00169 for (int32_t d=0; d<degree; d++) 00170 { 00171 mask = mask | (1 << (degree-d-1)); 00172 int32_t idx=strings->get_masked_symbols(v, mask); 00173 idx=strings->shift_symbol(idx, degree-d-1); 00174 result += vec2[offs + idx]*spec_weights[d]; 00175 offs+=strings->shift_offset(1,d+1); 00176 } 00177 } 00178 00179 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00180 00181 if (normalization_factors) 00182 result*=normalization_factors[vec_idx1]; 00183 } 00184 else 00185 SG_ERROR("huh?\n"); 00186 00187 return result; 00188 } 00189 00190 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00191 { 00192 int32_t len1=-1; 00193 bool free_vec1; 00194 uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00195 00196 if (normalization_factors) 00197 alpha*=normalization_factors[vec_idx1]; 00198 00199 if (vec && len1>0) 00200 { 00201 for (int32_t j=0; j<len1; j++) 00202 { 00203 uint8_t mask=0; 00204 int32_t offs=0; 00205 for (int32_t d=0; d<degree; d++) 00206 { 00207 mask = mask | (1 << (degree-d-1)); 00208 int32_t idx=strings->get_masked_symbols(vec[j], mask); 00209 idx=strings->shift_symbol(idx, degree-d-1); 00210 if (abs_val) 00211 vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]); 00212 else 00213 vec2[offs + idx] += alpha*spec_weights[d]; 00214 offs+=strings->shift_offset(1,d+1); 00215 } 00216 } 00217 } 00218 00219 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00220 } 00221 00222 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const 00223 { 00224 return new CImplicitWeightedSpecFeatures(*this); 00225 } 00226 00227 void* CImplicitWeightedSpecFeatures::get_feature_iterator(int32_t vector_index) 00228 { 00229 if (vector_index>=num_strings) 00230 { 00231 SG_ERROR("Index out of bounds (number of strings %d, you " 00232 "requested %d)\n", num_strings, vector_index); 00233 } 00234 00235 wspec_feature_iterator* it=new wspec_feature_iterator[1]; 00236 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00237 it->vidx=vector_index; 00238 00239 it->offs=0; 00240 it->d=0; 00241 it->j=0; 00242 it->mask=0; 00243 it->alpha=normalization_factors[vector_index]; 00244 00245 return it; 00246 } 00247 00248 bool CImplicitWeightedSpecFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00249 { 00250 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator; 00251 00252 if (it->d>=degree) 00253 { 00254 if (it->j < it->vlen-1) 00255 { 00256 it->j++; 00257 it->d=0; 00258 it->mask=0; 00259 it->offs=0; 00260 } 00261 else 00262 return false; 00263 } 00264 00265 int32_t d=it->d; 00266 00267 it->mask = it->mask | (1 << (degree-d-1)); 00268 int32_t idx=strings->get_masked_symbols(it->vec[it->j], it->mask); 00269 idx=strings->shift_symbol(idx, degree-d-1); 00270 value=it->alpha*spec_weights[d]; 00271 index=it->offs + idx; 00272 it->offs+=strings->shift_offset(1,d+1); 00273 00274 it->d=d+1; 00275 return true; 00276 } 00277 00278 void CImplicitWeightedSpecFeatures::free_feature_iterator(void* iterator) 00279 { 00280 ASSERT(iterator); 00281 wspec_feature_iterator* it=(wspec_feature_iterator*) iterator; 00282 strings->free_feature_vector(it->vec, it->vidx, it->vfree); 00283 delete[] it; 00284 }