|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "distributions/LinearHMM.h" 00013 #include "lib/common.h" 00014 #include "features/StringFeatures.h" 00015 #include "lib/io.h" 00016 00017 using namespace shogun; 00018 00019 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f) 00020 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL) 00021 { 00022 set_features(f); 00023 sequence_length = f->get_vector_length(0); 00024 num_symbols = (int32_t) f->get_num_symbols(); 00025 num_params = sequence_length*num_symbols; 00026 } 00027 00028 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols) 00029 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL) 00030 { 00031 sequence_length = p_num_features; 00032 num_symbols = p_num_symbols; 00033 num_params = sequence_length*num_symbols; 00034 } 00035 00036 CLinearHMM::~CLinearHMM() 00037 { 00038 delete[] transition_probs; 00039 delete[] log_transition_probs; 00040 } 00041 00042 bool CLinearHMM::train(CFeatures* data) 00043 { 00044 if (data) 00045 { 00046 if (data->get_feature_class() != C_STRING || 00047 data->get_feature_type() != F_WORD) 00048 { 00049 SG_ERROR("Expected features of class string type word!\n"); 00050 } 00051 set_features(data); 00052 } 00053 delete[] transition_probs; 00054 delete[] log_transition_probs; 00055 int32_t* int_transition_probs=new int32_t[num_params]; 00056 00057 int32_t vec; 00058 int32_t i; 00059 00060 for (i=0; i< num_params; i++) 00061 int_transition_probs[i]=0; 00062 00063 for (vec=0; vec<features->get_num_vectors(); vec++) 00064 { 00065 int32_t len; 00066 bool free_vec; 00067 00068 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00069 get_feature_vector(vec, len, free_vec); 00070 00071 //just count the symbols per position -> transition_probsogram 00072 for (int32_t feat=0; feat<len ; feat++) 00073 int_transition_probs[feat*num_symbols+vector[feat]]++; 00074 00075 ((CStringFeatures<uint16_t>*) features)-> 00076 free_feature_vector(vector, vec, free_vec); 00077 } 00078 00079 //trade memory for speed 00080 transition_probs=new float64_t[num_params]; 00081 log_transition_probs=new float64_t[num_params]; 00082 00083 for (i=0;i<sequence_length;i++) 00084 { 00085 for (int32_t j=0; j<num_symbols; j++) 00086 { 00087 float64_t sum=0; 00088 int32_t offs=i*num_symbols+ 00089 ((CStringFeatures<uint16_t> *) features)-> 00090 get_masked_symbols((uint16_t)j,(uint8_t) 254); 00091 int32_t original_num_symbols=(int32_t) 00092 ((CStringFeatures<uint16_t> *) features)-> 00093 get_original_num_symbols(); 00094 00095 for (int32_t k=0; k<original_num_symbols; k++) 00096 sum+=int_transition_probs[offs+k]; 00097 00098 transition_probs[i*num_symbols+j]= 00099 (int_transition_probs[i*num_symbols+j]+pseudo_count)/ 00100 (sum+((CStringFeatures<uint16_t> *) features)-> 00101 get_original_num_symbols()*pseudo_count); 00102 log_transition_probs[i*num_symbols+j]= 00103 log(transition_probs[i*num_symbols+j]); 00104 } 00105 } 00106 00107 delete[] int_transition_probs; 00108 return true; 00109 } 00110 00111 bool CLinearHMM::train( 00112 const int32_t* indizes, int32_t num_indizes, float64_t pseudo) 00113 { 00114 delete[] transition_probs; 00115 delete[] log_transition_probs; 00116 int32_t* int_transition_probs=new int32_t[num_params]; 00117 int32_t vec; 00118 int32_t i; 00119 00120 for (i=0; i< num_params; i++) 00121 int_transition_probs[i]=0; 00122 00123 for (vec=0; vec<num_indizes; vec++) 00124 { 00125 int32_t len; 00126 bool free_vec; 00127 00128 ASSERT(indizes[vec]>=0 && 00129 indizes[vec]<((CStringFeatures<uint16_t>*) features)-> 00130 get_num_vectors()); 00131 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00132 get_feature_vector(indizes[vec], len, free_vec); 00133 ((CStringFeatures<uint16_t>*) features)-> 00134 free_feature_vector(vector, indizes[vec], free_vec); 00135 00136 //just count the symbols per position -> transition_probsogram 00137 // 00138 for (int32_t feat=0; feat<len ; feat++) 00139 int_transition_probs[feat*num_symbols+vector[feat]]++; 00140 } 00141 00142 //trade memory for speed 00143 transition_probs=new float64_t[num_params]; 00144 log_transition_probs=new float64_t[num_params]; 00145 00146 for (i=0;i<sequence_length;i++) 00147 { 00148 for (int32_t j=0; j<num_symbols; j++) 00149 { 00150 float64_t sum=0; 00151 int32_t original_num_symbols=(int32_t) 00152 ((CStringFeatures<uint16_t> *) features)-> 00153 get_original_num_symbols(); 00154 for (int32_t k=0; k<original_num_symbols; k++) 00155 { 00156 sum+=int_transition_probs[i*num_symbols+ 00157 ((CStringFeatures<uint16_t>*) features)-> 00158 get_masked_symbols((uint16_t)j,(uint8_t) 254)+k]; 00159 } 00160 00161 transition_probs[i*num_symbols+j]= 00162 (int_transition_probs[i*num_symbols+j]+pseudo)/ 00163 (sum+((CStringFeatures<uint16_t>*) features)-> 00164 get_original_num_symbols()*pseudo); 00165 log_transition_probs[i*num_symbols+j]= 00166 log(transition_probs[i*num_symbols+j]); 00167 } 00168 } 00169 00170 delete[] int_transition_probs; 00171 return true; 00172 } 00173 00174 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len) 00175 { 00176 float64_t result=log_transition_probs[vector[0]]; 00177 00178 for (int32_t i=1; i<len; i++) 00179 result+=log_transition_probs[i*num_symbols+vector[i]]; 00180 00181 return result; 00182 } 00183 00184 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example) 00185 { 00186 int32_t len; 00187 bool free_vec; 00188 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00189 get_feature_vector(num_example, len, free_vec); 00190 float64_t result=log_transition_probs[vector[0]]; 00191 00192 for (int32_t i=1; i<len; i++) 00193 result+=log_transition_probs[i*num_symbols+vector[i]]; 00194 00195 ((CStringFeatures<uint16_t>*) features)-> 00196 free_feature_vector(vector, num_example, free_vec); 00197 00198 return result; 00199 } 00200 00201 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len) 00202 { 00203 float64_t result=transition_probs[vector[0]]; 00204 00205 for (int32_t i=1; i<len; i++) 00206 result*=transition_probs[i*num_symbols+vector[i]]; 00207 00208 return result; 00209 } 00210 00211 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example) 00212 { 00213 int32_t len; 00214 bool free_vec; 00215 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00216 get_feature_vector(num_example, len, free_vec); 00217 float64_t result=0; 00218 int32_t position=num_param/num_symbols; 00219 ASSERT(position>=0 && position<len); 00220 uint16_t sym=(uint16_t) (num_param-position*num_symbols); 00221 00222 if (vector[position]==sym && transition_probs[num_param]!=0) 00223 result=1.0/transition_probs[num_param]; 00224 ((CStringFeatures<uint16_t>*) features)-> 00225 free_feature_vector(vector, num_example, free_vec); 00226 00227 return result; 00228 } 00229 00230 void CLinearHMM::get_transition_probs(float64_t** dst, int32_t* num) 00231 { 00232 *num=num_params; 00233 size_t sz=sizeof(*transition_probs)*(*num); 00234 *dst=(float64_t*) malloc(sz); 00235 ASSERT(dst); 00236 00237 memcpy(*dst, transition_probs, sz); 00238 } 00239 00240 bool CLinearHMM::set_transition_probs(const float64_t* src, int32_t num) 00241 { 00242 if (num!=-1) 00243 ASSERT(num==num_params); 00244 00245 if (!log_transition_probs) 00246 log_transition_probs=new float64_t[num_params]; 00247 00248 if (!transition_probs) 00249 transition_probs=new float64_t[num_params]; 00250 00251 for (int32_t i=0; i<num_params; i++) 00252 { 00253 transition_probs[i]=src[i]; 00254 log_transition_probs[i]=log(transition_probs[i]); 00255 } 00256 00257 return true; 00258 } 00259 00260 void CLinearHMM::get_log_transition_probs(float64_t** dst, int32_t* num) 00261 { 00262 *num=num_params; 00263 size_t sz=sizeof(*log_transition_probs)*(*num); 00264 *dst=(float64_t*) malloc(sz); 00265 ASSERT(dst); 00266 00267 memcpy(*dst, log_transition_probs, sz); 00268 } 00269 00270 bool CLinearHMM::set_log_transition_probs(const float64_t* src, int32_t num) 00271 { 00272 if (num!=-1) 00273 ASSERT(num==num_params); 00274 00275 if (!log_transition_probs) 00276 log_transition_probs=new float64_t[num_params]; 00277 00278 if (!transition_probs) 00279 transition_probs=new float64_t[num_params]; 00280 00281 for (int32_t i=0; i< num_params; i++) 00282 { 00283 log_transition_probs[i]=src[i]; 00284 transition_probs[i]=exp(log_transition_probs[i]); 00285 } 00286 00287 return true; 00288 } 00289 00290 00291 00292