|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "features/WDFeatures.h" 00012 #include "lib/io.h" 00013 00014 using namespace shogun; 00015 00016 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str, 00017 int32_t order, int32_t from_order) : CDotFeatures() 00018 { 00019 ASSERT(str); 00020 ASSERT(str->have_same_length()); 00021 SG_REF(str); 00022 00023 strings=str; 00024 string_length=str->get_max_vector_length(); 00025 num_strings=str->get_num_vectors(); 00026 CAlphabet* alpha=str->get_alphabet(); 00027 alphabet_size=alpha->get_num_symbols(); 00028 SG_UNREF(alpha); 00029 00030 degree=order; 00031 from_degree=from_order; 00032 wd_weights=NULL; 00033 set_wd_weights(); 00034 set_normalization_const(); 00035 00036 } 00037 00038 CWDFeatures::CWDFeatures(const CWDFeatures& orig) 00039 : CDotFeatures(orig), strings(orig.strings), 00040 degree(orig.degree), from_degree(orig.from_degree), 00041 normalization_const(orig.normalization_const) 00042 { 00043 SG_REF(strings); 00044 string_length=strings->get_max_vector_length(); 00045 num_strings=strings->get_num_vectors(); 00046 CAlphabet* alpha=strings->get_alphabet(); 00047 alphabet_size=alpha->get_num_symbols(); 00048 SG_UNREF(alpha); 00049 00050 wd_weights=NULL; 00051 set_wd_weights(); 00052 } 00053 00054 CWDFeatures::~CWDFeatures() 00055 { 00056 SG_UNREF(strings); 00057 delete[] wd_weights; 00058 } 00059 00060 float64_t CWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2) 00061 { 00062 int32_t len1, len2; 00063 bool free_vec1, free_vec2; 00064 00065 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00066 uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2); 00067 00068 ASSERT(len1==len2); 00069 00070 float64_t sum=0.0; 00071 00072 for (int32_t i=0; i<len1; i++) 00073 { 00074 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00075 { 00076 if (vec1[i+j]!=vec2[i+j]) 00077 break ; 00078 sum += wd_weights[j]*wd_weights[j]; 00079 } 00080 } 00081 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00082 strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00083 return sum/CMath::sq(normalization_const); 00084 } 00085 00086 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00087 { 00088 if (vec2_len != w_dim) 00089 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00090 00091 float64_t sum=0; 00092 int32_t lim=CMath::min(degree, string_length); 00093 int32_t len; 00094 bool free_vec1; 00095 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00096 int32_t* val=new int32_t[len]; 00097 CMath::fill_vector(val, len, 0); 00098 00099 int32_t asize=alphabet_size; 00100 int32_t asizem1=1; 00101 int32_t offs=0; 00102 00103 for (int32_t k=0; k<lim; k++) 00104 { 00105 float64_t wd = wd_weights[k]; 00106 00107 int32_t o=offs; 00108 for (int32_t i=0; i+k < len; i++) 00109 { 00110 val[i]+=asizem1*vec[i+k]; 00111 sum+=vec2[val[i]+o]*wd; 00112 o+=asize; 00113 } 00114 offs+=asize*len; 00115 asize*=alphabet_size; 00116 asizem1*=alphabet_size; 00117 } 00118 delete[] val; 00119 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00120 00121 return sum/normalization_const; 00122 } 00123 00124 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00125 { 00126 if (vec2_len != w_dim) 00127 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00128 00129 int32_t lim=CMath::min(degree, string_length); 00130 int32_t len; 00131 bool free_vec1; 00132 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00133 int32_t* val=new int32_t[len]; 00134 CMath::fill_vector(val, len, 0); 00135 00136 int32_t asize=alphabet_size; 00137 int32_t asizem1=1; 00138 int32_t offs=0; 00139 00140 for (int32_t k=0; k<lim; k++) 00141 { 00142 float64_t wd = alpha*wd_weights[k]/normalization_const; 00143 00144 if (abs_val) 00145 wd=CMath::abs(wd); 00146 00147 int32_t o=offs; 00148 for (int32_t i=0; i+k < len; i++) 00149 { 00150 val[i]+=asizem1*vec[i+k]; 00151 vec2[val[i]+o]+=wd; 00152 o+=asize; 00153 } 00154 offs+=asize*len; 00155 asize*=alphabet_size; 00156 asizem1*=alphabet_size; 00157 } 00158 delete[] val; 00159 00160 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00161 } 00162 00163 void CWDFeatures::set_wd_weights() 00164 { 00165 ASSERT(degree>0 && degree<=8); 00166 delete[] wd_weights; 00167 wd_weights=new float64_t[degree]; 00168 w_dim=0; 00169 00170 for (int32_t i=0; i<degree; i++) 00171 { 00172 w_dim+=CMath::pow(alphabet_size, i+1)*string_length; 00173 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00174 } 00175 SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length); 00176 } 00177 00178 00179 void CWDFeatures::set_normalization_const(float64_t n) 00180 { 00181 if (n==0) 00182 { 00183 normalization_const=0; 00184 for (int32_t i=0; i<degree; i++) 00185 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00186 00187 normalization_const=CMath::sqrt(normalization_const); 00188 } 00189 else 00190 normalization_const=n; 00191 00192 SG_DEBUG("normalization_const:%f\n", normalization_const); 00193 } 00194 00195 void* CWDFeatures::get_feature_iterator(int32_t vector_index) 00196 { 00197 if (vector_index>=num_strings) 00198 { 00199 SG_ERROR("Index out of bounds (number of strings %d, you " 00200 "requested %d)\n", num_strings, vector_index); 00201 } 00202 00203 wd_feature_iterator* it=new wd_feature_iterator[1]; 00204 00205 it->lim=CMath::min(degree, string_length); 00206 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00207 it->vidx=vector_index; 00208 00209 it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00210 it->val=new int32_t[it->vlen]; 00211 CMath::fill_vector(it->val, it->vlen, 0); 00212 00213 it->asize=alphabet_size; 00214 it->asizem1=1; 00215 it->offs=0; 00216 it->k=0; 00217 it->i=0; 00218 it->o=0; 00219 00220 return it; 00221 } 00222 00223 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00224 { 00225 wd_feature_iterator* it=(wd_feature_iterator*) iterator; 00226 00227 if (it->i + it->k >= it->vlen) 00228 { 00229 if (it->k < it->lim-1) 00230 { 00231 it->offs+=it->asize*it->vlen; 00232 it->asize*=alphabet_size; 00233 it->asizem1*=alphabet_size; 00234 it->k++; 00235 it->i=0; 00236 it->o=it->offs; 00237 } 00238 else 00239 return false; 00240 } 00241 00242 int32_t i=it->i; 00243 int32_t k=it->k; 00244 #ifdef DEBUG_WDFEATURES 00245 SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1); 00246 #endif 00247 00248 it->val[i]+=it->asizem1*it->vec[i+k]; 00249 value=wd_weights[k]/normalization_const; 00250 index=it->val[i]+it->o; 00251 #ifdef DEBUG_WDFEATURES 00252 SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen); 00253 #endif 00254 00255 it->o+=it->asize; 00256 it->i=i+1; 00257 00258 return true; 00259 } 00260 00261 void CWDFeatures::free_feature_iterator(void* iterator) 00262 { 00263 ASSERT(iterator); 00264 wd_feature_iterator* it=(wd_feature_iterator*) iterator; 00265 strings->free_feature_vector(it->vec, it->vidx, it->vfree); 00266 delete[] it->val; 00267 delete[] it; 00268 } 00269 00270 CFeatures* CWDFeatures::duplicate() const 00271 { 00272 return new CWDFeatures(*this); 00273 }