|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <vector> 00013 00014 #include "lib/common.h" 00015 #include "lib/io.h" 00016 #include "lib/Signal.h" 00017 #include "lib/Trie.h" 00018 #include "base/Parallel.h" 00019 00020 #include "kernel/SpectrumRBFKernel.h" 00021 #include "features/Features.h" 00022 #include "features/StringFeatures.h" 00023 #include <math.h> 00024 00025 #include <vector> 00026 #include <string> 00027 #include <fstream> 00028 #include <cmath> 00029 00030 #include <assert.h> 00031 00032 #ifndef WIN32 00033 #include <pthread.h> 00034 #endif 00035 00036 00037 using namespace shogun; 00038 00039 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_) 00040 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00041 { 00042 lhs=NULL; 00043 rhs=NULL; 00044 00045 target_letter_0=-1 ; 00046 00047 AA_matrix=new float64_t[128*128]; 00048 00049 00050 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00051 00052 read_profiles_and_sequences(); 00053 00054 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00055 string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, IUPAC_AMINO_ACID); 00056 init(string_features, string_features); 00057 } 00058 00059 CSpectrumRBFKernel::CSpectrumRBFKernel( 00060 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_) 00061 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00062 { 00063 target_letter_0=-1 ; 00064 00065 AA_matrix=new float64_t[128*128]; 00066 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00067 00068 init(l, r); 00069 } 00070 00071 CSpectrumRBFKernel::~CSpectrumRBFKernel() 00072 { 00073 cleanup(); 00074 delete[] AA_matrix ; 00075 } 00076 00077 00078 void CSpectrumRBFKernel::remove_lhs() 00079 { 00080 00081 CKernel::remove_lhs(); 00082 } 00083 00084 void CSpectrumRBFKernel::read_profiles_and_sequences() 00085 { 00086 00087 int32_t aa_to_index[128];//profile 00088 aa_to_index['A'] = 0; 00089 aa_to_index['R'] = 1; 00090 aa_to_index['N'] = 2; 00091 aa_to_index['D'] = 3; 00092 aa_to_index['C'] = 4; 00093 aa_to_index['Q'] = 5; 00094 aa_to_index['E'] = 6; 00095 aa_to_index['G'] = 7; 00096 aa_to_index['H'] = 8; 00097 aa_to_index['I'] = 9; 00098 aa_to_index['L'] = 10; 00099 aa_to_index['K'] = 11; 00100 aa_to_index['M'] = 12; 00101 aa_to_index['F'] = 13; 00102 aa_to_index['P'] = 14; 00103 aa_to_index['S'] = 15; 00104 aa_to_index['T'] = 16; 00105 aa_to_index['W'] = 17; 00106 aa_to_index['Y'] = 18; 00107 aa_to_index['V'] = 19; 00108 SG_DEBUG("initializing background\n"); 00109 double background[20]; // profile 00110 background[0]=0.0799912015849807; //A 00111 background[1]=0.0484482507611578;//R 00112 background[2]=0.044293531582512;//N 00113 background[3]=0.0578891399707563;//D 00114 background[4]=0.0171846021407367;//C 00115 background[5]=0.0380578923048682;//Q 00116 background[6]=0.0638169929675978;//E 00117 background[7]=0.0760659374742852;//G 00118 background[8]=0.0223465499452473;//H 00119 background[9]=0.0550905793661343;//I 00120 background[10]=0.0866897071203864;//L 00121 background[11]=0.060458245507428;//K 00122 background[12]=0.0215379186368154;//M 00123 background[13]=0.0396348024787477;//F 00124 background[14]=0.0465746314476874;//P 00125 background[15]=0.0630028230885602;//S 00126 background[16]=0.0580394726014824;//T 00127 background[17]=0.0144991866213453;//W 00128 background[18]=0.03635438623143;//Y 00129 background[19]=0.0700241481678408;//V 00130 00131 00132 std::vector<std::string> seqs; 00133 //int32_t nof_sequences = 7329; 00134 00135 double C = 0.8; 00136 const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles"; 00137 std::ifstream fin(filename); 00138 00139 SG_DEBUG("Reading profiles from %s\n", filename); 00140 std::string line; 00141 while (!fin.eof()) 00142 { 00143 std::getline(fin, line); 00144 00145 if (line[0] == '>') // new sequence 00146 { 00147 int idx = line.find_first_of(' '); 00148 sequence_labels.push_back(line.substr(1,idx-1)); 00149 std::getline(fin, line); 00150 std::string orig_sequence = line; 00151 std::string sequence=""; 00152 00153 int len_line = line.length(); 00154 00155 // skip 3 lines 00156 00157 std::getline(fin, line); 00158 std::getline(fin, line); 00159 std::getline(fin, line); 00160 00161 profiles.push_back(std::vector<double>()); 00162 00163 std::vector<double>& curr_profile = profiles.back(); 00164 for (int i=0; i < len_line; ++i) 00165 { 00166 std::getline(fin, line); 00167 int a = line.find_first_not_of(' '); // index position 00168 int b = line.find_first_of(' ', a); // index position 00169 a = line.find_first_not_of(' ', b); // aa position 00170 b = line.find_first_of(' ', a); // aa position 00171 std::string aa=line.substr(a,b-a); 00172 if (0) //(aa =="B" || aa == "X" || aa == "Z") 00173 { 00174 int pos = seqs.size()+1; 00175 SG_DEBUG("Skipping aa in sequence %d\n", pos); 00176 continue; 00177 } 00178 else 00179 { 00180 sequence += aa; 00181 00182 a = line.find_first_not_of(' ', b); // beginning of block to ignore 00183 b = line.find_first_of(' ', a); // aa position 00184 00185 for (int j=0; j < 19; ++j) 00186 { 00187 a = line.find_first_not_of(' ', b); 00188 b = line.find_first_of(' ', a); 00189 } 00190 00191 int all_zeros = 1; 00192 // interesting block 00193 for (int j=0; j < 20; ++j) 00194 { 00195 a = line.find_first_not_of(' ', b); 00196 b = line.find_first_of(' ', a); 00197 double p = atof(line.substr(a, b-a).c_str()); 00198 if (p > 0) 00199 { 00200 all_zeros = 0; 00201 } 00202 double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00203 curr_profile.push_back(value); 00204 //SG_DEBUG("seq %d aa %d value %f p %f bg %f\n", i, j, value,p, background[j]); 00205 } 00206 00207 if (all_zeros) 00208 { 00209 SG_DEBUG(">>>>>>>>>>>>>>> all zeros"); 00210 if (aa !="B" && aa != "X" && aa != "Z") 00211 { 00212 //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]); 00213 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]]; 00214 double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00215 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]); 00216 curr_profile[(i*20) + aa_index] = value; 00217 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value); 00218 00219 /* 00220 for (int z=0; z <20; ++z) 00221 { 00222 SG_DEBUG(" %d \t %f\t", z, curr_profile[z]); 00223 } 00224 SG_DEBUG("\n"); 00225 */ 00226 } 00227 } 00228 } 00229 } 00230 00231 if (curr_profile.size() != 20 * sequence.length()) 00232 { 00233 SG_ERROR("Something's wrong with the profile.\n"); 00234 break; 00235 } 00236 00237 seqs.push_back(sequence); 00238 00239 00240 /* 00241 // 6 irrelevant lines 00242 for (int i=0; i < 6; ++i) 00243 { 00244 std::getline(fin, line); 00245 } 00246 // 00247 */ 00248 } 00249 } 00250 00251 fin.close(); 00252 00253 nof_sequences = seqs.size(); 00254 sequences = new T_STRING<char>[nof_sequences]; 00255 00256 int max_len = 0; 00257 for (int i=0; i < nof_sequences; ++i) 00258 { 00259 int len = seqs[i].length(); 00260 sequences[i].string = new char[len+1]; 00261 sequences[i].length = len; 00262 strcpy(sequences[i].string, seqs[i].c_str()); 00263 00264 if (len > max_len) max_len = len; 00265 } 00266 00267 max_sequence_length = max_len; 00268 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00269 00270 } 00271 00272 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r) 00273 { 00274 // >> profile 00275 /* 00276 read_profiles_and_sequences(); 00277 l = string_features; 00278 r = string_features; 00279 */ 00280 // << profile 00281 00282 int32_t lhs_changed=(lhs!=l); 00283 int32_t rhs_changed=(rhs!=r); 00284 00285 CStringKernel<char>::init(l,r); 00286 00287 SG_DEBUG("lhs_changed: %i\n", lhs_changed); 00288 SG_DEBUG("rhs_changed: %i\n", rhs_changed); 00289 00290 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00291 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00292 00293 SG_UNREF(alphabet); 00294 alphabet=sf_l->get_alphabet(); 00295 CAlphabet* ralphabet=sf_r->get_alphabet(); 00296 00297 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00298 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00299 00300 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()); 00301 SG_UNREF(ralphabet); 00302 00303 00304 return init_normalizer(); 00305 } 00306 00307 void CSpectrumRBFKernel::cleanup() 00308 { 00309 00310 SG_UNREF(alphabet); 00311 alphabet=NULL; 00312 00313 CKernel::cleanup(); 00314 } 00315 00316 inline bool isaa(char c) 00317 { 00318 if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z') 00319 return false ; 00320 return true ; 00321 } 00322 00323 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index) 00324 { 00325 //const char* AA = "ARNDCQEGHILKMFPSTWYV"; 00326 float64_t diff=0.0 ; 00327 00328 for (int i=0; i<seq_degree; i++) 00329 { 00330 if (!isaa(path[i])||!isaa(joint_seq[index+i])) 00331 diff+=1.4 ; 00332 else 00333 { 00334 diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ; 00335 diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ; 00336 diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ; 00337 if (CMath::is_nan(diff)) 00338 fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ; 00339 } 00340 } 00341 00342 return exp( - diff/width) ; 00343 } 00344 00345 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b) 00346 { 00347 int32_t alen, blen; 00348 bool afree, bfree; 00349 00350 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree); 00351 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree); 00352 00353 float64_t result=0; 00354 for (int32_t i=0; i<alen; i++) 00355 { 00356 for (int32_t j=0; j<blen; j++) 00357 { 00358 if ((i+degree<=alen) && (j+degree<=blen)) 00359 result += AA_helper(&(avec[i]), degree, bvec, j) ; 00360 } 00361 } 00362 00363 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree); 00364 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree); 00365 return result; 00366 } 00367 00368 bool CSpectrumRBFKernel::set_AA_matrix( 00369 float64_t* AA_matrix_) 00370 { 00371 00372 if (AA_matrix_) 00373 { 00374 SG_DEBUG("Setting AA_matrix\n") ; 00375 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00376 return true ; 00377 } 00378 00379 return false; 00380 }