|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 00011 #include "features/HashedWDFeatures.h" 00012 #include "lib/io.h" 00013 00014 using namespace shogun; 00015 00016 CHashedWDFeatures::CHashedWDFeatures(CStringFeatures<uint8_t>* str, 00017 int32_t start_order, int32_t order, int32_t from_order, 00018 int32_t hash_bits) : CDotFeatures() 00019 { 00020 ASSERT(start_order>=0); 00021 ASSERT(start_order<order); 00022 ASSERT(order<=from_order); 00023 ASSERT(hash_bits>0); 00024 ASSERT(str); 00025 ASSERT(str->have_same_length()); 00026 SG_REF(str); 00027 00028 strings=str; 00029 string_length=str->get_max_vector_length(); 00030 num_strings=str->get_num_vectors(); 00031 CAlphabet* alpha=str->get_alphabet(); 00032 alphabet_size=alpha->get_num_symbols(); 00033 SG_UNREF(alpha); 00034 00035 degree=order; 00036 start_degree=start_order; 00037 from_degree=from_order; 00038 m_hash_bits=hash_bits; 00039 set_wd_weights(); 00040 set_normalization_const(); 00041 } 00042 00043 CHashedWDFeatures::CHashedWDFeatures(const CHashedWDFeatures& orig) 00044 : CDotFeatures(orig), strings(orig.strings), 00045 degree(orig.degree), start_degree(orig.start_degree), 00046 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits), 00047 normalization_const(orig.normalization_const) 00048 { 00049 SG_REF(strings); 00050 string_length=strings->get_max_vector_length(); 00051 num_strings=strings->get_num_vectors(); 00052 CAlphabet* alpha=strings->get_alphabet(); 00053 alphabet_size=alpha->get_num_symbols(); 00054 SG_UNREF(alpha); 00055 00056 set_wd_weights(); 00057 } 00058 00059 CHashedWDFeatures::~CHashedWDFeatures() 00060 { 00061 SG_UNREF(strings); 00062 delete[] wd_weights; 00063 } 00064 00065 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2) 00066 { 00067 int32_t len1, len2; 00068 bool free_vec1, free_vec2; 00069 00070 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00071 uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2); 00072 00073 ASSERT(len1==len2); 00074 00075 float64_t sum=0.0; 00076 00077 for (int32_t i=0; i<len1; i++) 00078 { 00079 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00080 { 00081 if (vec1[i+j]!=vec2[i+j]) 00082 break; 00083 if (j>=start_degree) 00084 sum += wd_weights[j]*wd_weights[j]; 00085 } 00086 } 00087 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00088 strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00089 return sum/CMath::sq(normalization_const); 00090 } 00091 00092 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00093 { 00094 if (vec2_len != w_dim) 00095 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00096 00097 float64_t sum=0; 00098 int32_t lim=CMath::min(degree, string_length); 00099 int32_t len; 00100 bool free_vec1; 00101 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00102 uint32_t* val=new uint32_t[len]; 00103 00104 uint32_t offs=0; 00105 00106 if (start_degree>0) 00107 { 00108 // compute hash for strings of length start_degree-1 00109 for (int32_t i=0; i+start_degree < len; i++) 00110 val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF); 00111 } 00112 else 00113 CMath::fill_vector(val, len, 0xDEADBEAF); 00114 00115 for (int32_t k=start_degree; k<lim; k++) 00116 { 00117 float64_t wd = wd_weights[k]; 00118 00119 uint32_t o=offs; 00120 for (int32_t i=0; i+k < len; i++) 00121 { 00122 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00123 val[i]=h; 00124 #ifdef DEBUG_HASHEDWD 00125 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00126 #endif 00127 sum+=vec2[o+(h & mask)]*wd; 00128 o+=partial_w_dim; 00129 } 00130 offs+=partial_w_dim*len; 00131 } 00132 delete[] val; 00133 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00134 00135 return sum/normalization_const; 00136 } 00137 00138 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00139 { 00140 if (vec2_len != w_dim) 00141 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00142 00143 int32_t lim=CMath::min(degree, string_length); 00144 int32_t len; 00145 bool free_vec1; 00146 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00147 uint32_t* val=new uint32_t[len]; 00148 00149 uint32_t offs=0; 00150 00151 if (start_degree>0) 00152 { 00153 // compute hash for strings of length start_degree-1 00154 for (int32_t i=0; i+start_degree < len; i++) 00155 val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF); 00156 } 00157 else 00158 CMath::fill_vector(val, len, 0xDEADBEAF); 00159 00160 for (int32_t k=start_degree; k<lim; k++) 00161 { 00162 float64_t wd = alpha*wd_weights[k]/normalization_const; 00163 00164 if (abs_val) 00165 wd=CMath::abs(wd); 00166 00167 uint32_t o=offs; 00168 for (int32_t i=0; i+k < len; i++) 00169 { 00170 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00171 val[i]=h; 00172 00173 #ifdef DEBUG_HASHEDWD 00174 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h); 00175 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00176 #endif 00177 vec2[o+(h & mask)]+=wd; 00178 o+=partial_w_dim; 00179 } 00180 offs+=partial_w_dim*len; 00181 } 00182 00183 delete[] val; 00184 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00185 } 00186 00187 void CHashedWDFeatures::set_wd_weights() 00188 { 00189 ASSERT(degree>0); 00190 00191 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00192 partial_w_dim=1<<m_hash_bits; 00193 w_dim=partial_w_dim*string_length*(degree-start_degree); 00194 00195 wd_weights=new float64_t[degree]; 00196 00197 for (int32_t i=0; i<degree; i++) 00198 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00199 00200 SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, " 00201 "dim=%d partial_dim=%d num=%d, len=%d\n", 00202 degree, from_degree, alphabet_size, 00203 w_dim, partial_w_dim, num_strings, string_length); 00204 } 00205 00206 00207 void CHashedWDFeatures::set_normalization_const(float64_t n) 00208 { 00209 if (n==0) 00210 { 00211 normalization_const=0; 00212 for (int32_t i=0; i<degree; i++) 00213 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00214 00215 normalization_const=CMath::sqrt(normalization_const); 00216 } 00217 else 00218 normalization_const=n; 00219 00220 SG_DEBUG("normalization_const:%f\n", normalization_const); 00221 } 00222 00223 CFeatures* CHashedWDFeatures::duplicate() const 00224 { 00225 return new CHashedWDFeatures(*this); 00226 }