|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "features/SNPFeatures.h" 00012 #include "lib/io.h" 00013 #include "features/Alphabet.h" 00014 00015 using namespace shogun; 00016 00017 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(), 00018 m_str_min(NULL), m_str_maj(NULL) 00019 { 00020 ASSERT(str); 00021 ASSERT(str->have_same_length()); 00022 SG_REF(str); 00023 00024 strings=str; 00025 string_length=str->get_max_vector_length(); 00026 ASSERT((string_length & 1) == 0); // length divisible by 2 00027 w_dim=3*string_length/2; 00028 num_strings=str->get_num_vectors(); 00029 CAlphabet* alpha=str->get_alphabet(); 00030 ASSERT(alpha->get_alphabet()==SNP); 00031 SG_UNREF(alpha); 00032 00033 obtain_base_strings(); 00034 set_normalization_const(); 00035 00036 } 00037 00038 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig) 00039 : CDotFeatures(orig), strings(orig.strings), 00040 normalization_const(orig.normalization_const), 00041 m_str_min(NULL), m_str_maj(NULL) 00042 { 00043 SG_REF(strings); 00044 string_length=strings->get_max_vector_length(); 00045 ASSERT((string_length & 1) == 0); // length divisible by 2 00046 w_dim=3*string_length; 00047 num_strings=strings->get_num_vectors(); 00048 CAlphabet* alpha=strings->get_alphabet(); 00049 SG_UNREF(alpha); 00050 00051 obtain_base_strings(); 00052 } 00053 00054 CSNPFeatures::~CSNPFeatures() 00055 { 00056 SG_UNREF(strings); 00057 } 00058 00059 float64_t CSNPFeatures::dot(int32_t idx_a, int32_t idx_b) 00060 { 00061 int32_t alen, blen; 00062 bool free_avec, free_bvec; 00063 00064 uint8_t* avec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(idx_a, alen, free_avec); 00065 uint8_t* bvec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(idx_b, blen, free_bvec); 00066 00067 ASSERT(alen==blen); 00068 if (alen!=string_length) 00069 SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length); 00070 ASSERT(m_str_min); 00071 ASSERT(m_str_maj); 00072 00073 float64_t total=0; 00074 00075 for (int32_t i = 0; i<alen-1; i+=2) 00076 { 00077 int32_t sumaa=0; 00078 int32_t sumbb=0; 00079 int32_t sumab=0; 00080 00081 uint8_t a1=avec[i]; 00082 uint8_t a2=avec[i+1]; 00083 uint8_t b1=bvec[i]; 00084 uint8_t b2=bvec[i+1]; 00085 00086 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0')) 00087 sumab++; 00088 else if (a1==a2 && b1==b2) 00089 { 00090 if (a1!=b1) 00091 continue; 00092 00093 if (a1==m_str_min[i]) 00094 sumaa++; 00095 else if (a1==m_str_maj[i]) 00096 sumbb++; 00097 else 00098 { 00099 SG_ERROR("The impossible happened i=%d a1=%c " 00100 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]); 00101 } 00102 00103 } 00104 total+=sumaa+sumbb+sumab; 00105 } 00106 00107 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(avec, idx_a, free_avec); 00108 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(bvec, idx_b, free_bvec); 00109 return total; 00110 } 00111 00112 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00113 { 00114 if (vec2_len != w_dim) 00115 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00116 00117 float64_t sum=0; 00118 int32_t len; 00119 bool free_vec1; 00120 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00121 int32_t offs=0; 00122 00123 for (int32_t i=0; i<len; i+=2) 00124 { 00125 int32_t dim=0; 00126 00127 char a1=vec[i]; 00128 char a2=vec[i+1]; 00129 00130 if (a1==a2 && a1!='0' && a2!='0') 00131 { 00132 if (a1==m_str_min[i]) 00133 dim=1; 00134 else if (a1==m_str_maj[i]) 00135 dim=2; 00136 else 00137 { 00138 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00139 i, a1,a2, m_str_min[i], m_str_maj[i]); 00140 } 00141 } 00142 00143 sum+=vec2[offs+dim]; 00144 offs+=3; 00145 } 00146 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00147 00148 return sum/normalization_const; 00149 } 00150 00151 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00152 { 00153 if (vec2_len != w_dim) 00154 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00155 00156 int32_t len; 00157 bool free_vec1; 00158 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00159 int32_t offs=0; 00160 00161 if (abs_val) 00162 alpha=CMath::abs(alpha); 00163 00164 for (int32_t i=0; i<len; i+=2) 00165 { 00166 int32_t dim=0; 00167 00168 char a1=vec[i]; 00169 char a2=vec[i+1]; 00170 00171 if (a1==a2 && a1!='0' && a2!='0') 00172 { 00173 if (a1==m_str_min[i]) 00174 dim=1; 00175 else if (a1==m_str_maj[i]) 00176 dim=2; 00177 else 00178 { 00179 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00180 i, a1,a2, m_str_min[i], m_str_maj[i]); 00181 } 00182 } 00183 00184 vec2[offs+dim]+=alpha; 00185 offs+=3; 00186 } 00187 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00188 } 00189 00190 void CSNPFeatures::obtain_base_strings() 00191 { 00192 for (int32_t i=0; i<num_strings; i++) 00193 { 00194 int32_t len; 00195 bool free_vec; 00196 uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec); 00197 ASSERT(string_length==len); 00198 00199 if (i==0) 00200 { 00201 size_t tlen=(len+1)*sizeof(uint8_t); 00202 m_str_min=(uint8_t*) malloc(tlen); 00203 m_str_maj=(uint8_t*) malloc(tlen); 00204 memset(m_str_min, 0, tlen); 00205 memset(m_str_maj, 0, tlen); 00206 } 00207 00208 for (int32_t j=0; j<len; j++) 00209 { 00210 // skip sequencing errors 00211 if (vec[j]=='0') 00212 continue; 00213 00214 if (m_str_min[j]==0) 00215 m_str_min[j]=vec[j]; 00216 else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j]) 00217 m_str_maj[j]=vec[j]; 00218 } 00219 00220 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec); 00221 } 00222 00223 for (int32_t j=0; j<string_length; j++) 00224 { 00225 // if only one symbol occurs use 0 00226 if (m_str_min[j]==0) 00227 m_str_min[j]='0'; 00228 if (m_str_maj[j]==0) 00229 m_str_maj[j]='0'; 00230 00231 if (m_str_min[j]>m_str_maj[j]) 00232 CMath::swap(m_str_min[j], m_str_maj[j]); 00233 } 00234 } 00235 00236 void CSNPFeatures::set_normalization_const(float64_t n) 00237 { 00238 if (n==0) 00239 { 00240 normalization_const=string_length; 00241 normalization_const=CMath::sqrt(normalization_const); 00242 } 00243 else 00244 normalization_const=n; 00245 00246 SG_DEBUG("normalization_const:%f\n", normalization_const); 00247 } 00248 00249 void* CSNPFeatures::get_feature_iterator(int32_t vector_index) 00250 { 00251 return NULL; 00252 } 00253 00254 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00255 { 00256 return false; 00257 } 00258 00259 void CSNPFeatures::free_feature_iterator(void* iterator) 00260 { 00261 } 00262 00263 CFeatures* CSNPFeatures::duplicate() const 00264 { 00265 return new CSNPFeatures(*this); 00266 }