|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Soeren Sonnenburg 00008 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <string.h> 00012 #include <math.h> 00013 00014 #include "features/Alphabet.h" 00015 #include "lib/io.h" 00016 00017 00018 #ifdef HAVE_BOOST_SERIALIZATION 00019 #include <boost/serialization/export.hpp> 00020 BOOST_CLASS_EXPORT(shogun::CAlphabet); 00021 #endif //HAVE_BOOST_SERIALIZATION 00022 00023 00024 using namespace shogun; 00025 00026 //define numbers for the bases 00027 const uint8_t CAlphabet::B_A=0; 00028 const uint8_t CAlphabet::B_C=1; 00029 const uint8_t CAlphabet::B_G=2; 00030 const uint8_t CAlphabet::B_T=3; 00031 const uint8_t CAlphabet::B_0=4; 00032 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff; 00033 const char* CAlphabet::alphabet_names[18]={ 00034 "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM", 00035 "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID", 00036 "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN", 00037 "SNP", "RAWSNP"}; 00038 00039 /* 00040 CAlphabet::CAlphabet() 00041 : CSGObject() 00042 { 00043 00044 } 00045 */ 00046 00047 CAlphabet::CAlphabet(char* al, int32_t len) 00048 : CSGObject() 00049 { 00050 EAlphabet alpha=NONE; 00051 00052 if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA"))) 00053 alpha = DNA; 00054 else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA"))) 00055 alpha = RAWDNA; 00056 else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA"))) 00057 alpha = RNA; 00058 else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN"))) 00059 alpha = PROTEIN; 00060 else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY"))) 00061 alpha = BINARY; 00062 else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM"))) 00063 alpha = ALPHANUM; 00064 else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE"))) 00065 alpha = CUBE; 00066 else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2"))) 00067 alpha = DIGIT2; 00068 else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT"))) 00069 alpha = DIGIT; 00070 else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2"))) 00071 alpha = RAWDIGIT2; 00072 else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT"))) 00073 alpha = RAWDIGIT; 00074 else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP"))) 00075 alpha = SNP; 00076 else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP"))) 00077 alpha = RAWSNP; 00078 else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) || 00079 (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW")))) 00080 alpha = RAWBYTE; 00081 else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID"))) 00082 alpha = IUPAC_NUCLEIC_ACID; 00083 else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID"))) 00084 alpha = IUPAC_AMINO_ACID; 00085 else { 00086 SG_ERROR( "unknown alphabet %s\n", al); 00087 } 00088 00089 set_alphabet(alpha); 00090 } 00091 00092 CAlphabet::CAlphabet(EAlphabet alpha) 00093 : CSGObject() 00094 { 00095 set_alphabet(alpha); 00096 } 00097 00098 CAlphabet::CAlphabet(CAlphabet* a) 00099 : CSGObject() 00100 { 00101 ASSERT(a); 00102 set_alphabet(a->get_alphabet()); 00103 copy_histogram(a); 00104 } 00105 00106 CAlphabet::~CAlphabet() 00107 { 00108 } 00109 00110 bool CAlphabet::set_alphabet(EAlphabet alpha) 00111 { 00112 bool result=true; 00113 alphabet=alpha; 00114 00115 switch (alphabet) 00116 { 00117 case DNA: 00118 case RAWDNA: 00119 num_symbols = 4; 00120 break; 00121 case RNA: 00122 num_symbols = 4; 00123 break; 00124 case PROTEIN: 00125 num_symbols = 26; 00126 break; 00127 case BINARY: 00128 num_symbols = 2; 00129 break; 00130 case ALPHANUM: 00131 num_symbols = 36; 00132 break; 00133 case CUBE: 00134 num_symbols = 6; 00135 break; 00136 case RAWBYTE: 00137 num_symbols = 256; 00138 break; 00139 case IUPAC_NUCLEIC_ACID: 00140 num_symbols = 16; 00141 break; 00142 case IUPAC_AMINO_ACID: 00143 num_symbols = 23; 00144 break; 00145 case NONE: 00146 num_symbols = 0; 00147 break; 00148 case DIGIT2: 00149 num_symbols = 3; 00150 break; 00151 case DIGIT: 00152 num_symbols = 10; 00153 break; 00154 case RAWDIGIT2: 00155 num_symbols = 3; 00156 break; 00157 case RAWDIGIT: 00158 num_symbols = 10; 00159 break; 00160 case SNP: 00161 num_symbols = 5; 00162 break; 00163 case RAWSNP: 00164 num_symbols = 5; 00165 break; 00166 default: 00167 num_symbols = 0; 00168 result=false; 00169 break; 00170 } 00171 00172 num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2)); 00173 init_map_table(); 00174 clear_histogram(); 00175 00176 SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet)); 00177 00178 return result; 00179 } 00180 00181 void CAlphabet::init_map_table() 00182 { 00183 for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++) 00184 { 00185 maptable_to_bin[i] = MAPTABLE_UNDEF; 00186 maptable_to_char[i] = MAPTABLE_UNDEF; 00187 valid_chars[i] = false; 00188 } 00189 00190 switch (alphabet) 00191 { 00192 case RAWDIGIT: 00193 for (uint8_t i=0; i<=9; i++) 00194 { 00195 valid_chars[i]=true; 00196 maptable_to_bin[i]=i; 00197 maptable_to_char[i]=i; 00198 } 00199 break; 00200 00201 case RAWDIGIT2: 00202 for (uint8_t i=0; i<=2; i++) 00203 { 00204 valid_chars[i]=true; 00205 maptable_to_bin[i]=i; 00206 maptable_to_char[i]=i; 00207 } 00208 break; 00209 00210 case DIGIT: 00211 valid_chars[(uint8_t) '0']=true; 00212 valid_chars[(uint8_t) '1']=true; 00213 valid_chars[(uint8_t) '2']=true; 00214 valid_chars[(uint8_t) '3']=true; 00215 valid_chars[(uint8_t) '4']=true; 00216 valid_chars[(uint8_t) '5']=true; 00217 valid_chars[(uint8_t) '6']=true; 00218 valid_chars[(uint8_t) '7']=true; 00219 valid_chars[(uint8_t) '8']=true; 00220 valid_chars[(uint8_t) '9']=true; //Translation '0-9' -> 0-9 00221 00222 maptable_to_bin[(uint8_t) '0']=0; 00223 maptable_to_bin[(uint8_t) '1']=1; 00224 maptable_to_bin[(uint8_t) '2']=2; 00225 maptable_to_bin[(uint8_t) '3']=3; 00226 maptable_to_bin[(uint8_t) '4']=4; 00227 maptable_to_bin[(uint8_t) '5']=5; 00228 maptable_to_bin[(uint8_t) '6']=6; 00229 maptable_to_bin[(uint8_t) '7']=7; 00230 maptable_to_bin[(uint8_t) '8']=8; 00231 maptable_to_bin[(uint8_t) '9']=9; //Translation '0-9' -> 0-9 00232 00233 maptable_to_char[(uint8_t) 0]='0'; 00234 maptable_to_char[(uint8_t) 1]='1'; 00235 maptable_to_char[(uint8_t) 2]='2'; 00236 maptable_to_char[(uint8_t) 3]='3'; 00237 maptable_to_char[(uint8_t) 4]='4'; 00238 maptable_to_char[(uint8_t) 5]='5'; 00239 maptable_to_char[(uint8_t) 6]='6'; 00240 maptable_to_char[(uint8_t) 7]='7'; 00241 maptable_to_char[(uint8_t) 8]='8'; 00242 maptable_to_char[(uint8_t) 9]='9'; //Translation 0-9 -> '0-9' 00243 break; 00244 00245 case DIGIT2: 00246 valid_chars[(uint8_t) '0']=true; 00247 valid_chars[(uint8_t) '1']=true; 00248 valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2 00249 00250 maptable_to_bin[(uint8_t) '0']=0; 00251 maptable_to_bin[(uint8_t) '1']=1; 00252 maptable_to_bin[(uint8_t) '2']=2; //Translation '0-2' -> 0-2 00253 00254 maptable_to_char[(uint8_t) 0]='0'; 00255 maptable_to_char[(uint8_t) 1]='1'; 00256 maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2' 00257 break; 00258 00259 case CUBE: 00260 valid_chars[(uint8_t) '1']=true; 00261 valid_chars[(uint8_t) '2']=true; 00262 valid_chars[(uint8_t) '3']=true; 00263 valid_chars[(uint8_t) '4']=true; 00264 valid_chars[(uint8_t) '5']=true; 00265 valid_chars[(uint8_t) '6']=true; //Translation '123456' -> 012345 00266 00267 maptable_to_bin[(uint8_t) '1']=0; 00268 maptable_to_bin[(uint8_t) '2']=1; 00269 maptable_to_bin[(uint8_t) '3']=2; 00270 maptable_to_bin[(uint8_t) '4']=3; 00271 maptable_to_bin[(uint8_t) '5']=4; 00272 maptable_to_bin[(uint8_t) '6']=5; //Translation '123456' -> 012345 00273 00274 maptable_to_char[(uint8_t) 0]='1'; 00275 maptable_to_char[(uint8_t) 1]='2'; 00276 maptable_to_char[(uint8_t) 2]='3'; 00277 maptable_to_char[(uint8_t) 3]='4'; 00278 maptable_to_char[(uint8_t) 4]='5'; 00279 maptable_to_char[(uint8_t) 5]='6'; //Translation 012345->'123456' 00280 break; 00281 00282 case PROTEIN: 00283 { 00284 int32_t skip=0 ; 00285 for (int32_t i=0; i<21; i++) 00286 { 00287 if (i==1) skip++ ; 00288 if (i==8) skip++ ; 00289 if (i==12) skip++ ; 00290 if (i==17) skip++ ; 00291 valid_chars['A'+i+skip]=true; 00292 maptable_to_bin['A'+i+skip]=i ; 00293 maptable_to_char[i]='A'+i+skip ; 00294 } ; //Translation 012345->acde...xy -- the protein code 00295 } ; 00296 break; 00297 00298 case BINARY: 00299 valid_chars[(uint8_t) '0']=true; 00300 valid_chars[(uint8_t) '1']=true; 00301 00302 maptable_to_bin[(uint8_t) '0']=0; 00303 maptable_to_bin[(uint8_t) '1']=1; 00304 00305 maptable_to_char[0]=(uint8_t) '0'; 00306 maptable_to_char[1]=(uint8_t) '1'; 00307 break; 00308 00309 case ALPHANUM: 00310 { 00311 for (int32_t i=0; i<26; i++) 00312 { 00313 valid_chars['A'+i]=true; 00314 maptable_to_bin['A'+i]=i ; 00315 maptable_to_char[i]='A'+i ; 00316 } ; 00317 for (int32_t i=0; i<10; i++) 00318 { 00319 valid_chars['0'+i]=true; 00320 maptable_to_bin['0'+i]=26+i ; 00321 maptable_to_char[26+i]='0'+i ; 00322 } ; //Translation 012345->acde...xy0123456789 00323 } ; 00324 break; 00325 00326 case RAWBYTE: 00327 { 00328 //identity 00329 for (int32_t i=0; i<256; i++) 00330 { 00331 valid_chars[i]=true; 00332 maptable_to_bin[i]=i; 00333 maptable_to_char[i]=i; 00334 } 00335 } 00336 break; 00337 00338 case DNA: 00339 valid_chars[(uint8_t) 'A']=true; 00340 valid_chars[(uint8_t) 'C']=true; 00341 valid_chars[(uint8_t) 'G']=true; 00342 valid_chars[(uint8_t) 'T']=true; 00343 00344 maptable_to_bin[(uint8_t) 'A']=B_A; 00345 maptable_to_bin[(uint8_t) 'C']=B_C; 00346 maptable_to_bin[(uint8_t) 'G']=B_G; 00347 maptable_to_bin[(uint8_t) 'T']=B_T; 00348 00349 maptable_to_char[B_A]='A'; 00350 maptable_to_char[B_C]='C'; 00351 maptable_to_char[B_G]='G'; 00352 maptable_to_char[B_T]='T'; 00353 break; 00354 case RAWDNA: 00355 { 00356 //identity 00357 for (int32_t i=0; i<4; i++) 00358 { 00359 valid_chars[i]=true; 00360 maptable_to_bin[i]=i; 00361 maptable_to_char[i]=i; 00362 } 00363 } 00364 break; 00365 00366 case SNP: 00367 valid_chars[(uint8_t) 'A']=true; 00368 valid_chars[(uint8_t) 'C']=true; 00369 valid_chars[(uint8_t) 'G']=true; 00370 valid_chars[(uint8_t) 'T']=true; 00371 valid_chars[(uint8_t) '0']=true; 00372 00373 maptable_to_bin[(uint8_t) 'A']=B_A; 00374 maptable_to_bin[(uint8_t) 'C']=B_C; 00375 maptable_to_bin[(uint8_t) 'G']=B_G; 00376 maptable_to_bin[(uint8_t) 'T']=B_T; 00377 maptable_to_bin[(uint8_t) '0']=B_0; 00378 00379 maptable_to_char[B_A]='A'; 00380 maptable_to_char[B_C]='C'; 00381 maptable_to_char[B_G]='G'; 00382 maptable_to_char[B_T]='T'; 00383 maptable_to_char[B_0]='0'; 00384 break; 00385 case RAWSNP: 00386 { 00387 //identity 00388 for (int32_t i=0; i<5; i++) 00389 { 00390 valid_chars[i]=true; 00391 maptable_to_bin[i]=i; 00392 maptable_to_char[i]=i; 00393 } 00394 } 00395 break; 00396 00397 case RNA: 00398 valid_chars[(uint8_t) 'A']=true; 00399 valid_chars[(uint8_t) 'C']=true; 00400 valid_chars[(uint8_t) 'G']=true; 00401 valid_chars[(uint8_t) 'U']=true; 00402 00403 maptable_to_bin[(uint8_t) 'A']=B_A; 00404 maptable_to_bin[(uint8_t) 'C']=B_C; 00405 maptable_to_bin[(uint8_t) 'G']=B_G; 00406 maptable_to_bin[(uint8_t) 'U']=B_T; 00407 00408 maptable_to_char[B_A]='A'; 00409 maptable_to_char[B_C]='C'; 00410 maptable_to_char[B_G]='G'; 00411 maptable_to_char[B_T]='U'; 00412 break; 00413 00414 case IUPAC_NUCLEIC_ACID: 00415 valid_chars[(uint8_t) 'A']=true; // A Adenine 00416 valid_chars[(uint8_t) 'C']=true; // C Cytosine 00417 valid_chars[(uint8_t) 'G']=true; // G Guanine 00418 valid_chars[(uint8_t) 'T']=true; // T Thymine 00419 valid_chars[(uint8_t) 'U']=true; // U Uracil 00420 valid_chars[(uint8_t) 'R']=true; // R Purine (A or G) 00421 valid_chars[(uint8_t) 'Y']=true; // Y Pyrimidine (C, T, or U) 00422 valid_chars[(uint8_t) 'M']=true; // M C or A 00423 valid_chars[(uint8_t) 'K']=true; // K T, U, or G 00424 valid_chars[(uint8_t) 'W']=true; // W T, U, or A 00425 valid_chars[(uint8_t) 'S']=true; // S C or G 00426 valid_chars[(uint8_t) 'B']=true; // B C, T, U, or G (not A) 00427 valid_chars[(uint8_t) 'D']=true; // D A, T, U, or G (not C) 00428 valid_chars[(uint8_t) 'H']=true; // H A, T, U, or C (not G) 00429 valid_chars[(uint8_t) 'V']=true; // V A, C, or G (not T, not U) 00430 valid_chars[(uint8_t) 'N']=true; // N Any base (A, C, G, T, or U) 00431 00432 maptable_to_bin[(uint8_t) 'A']=0; // A Adenine 00433 maptable_to_bin[(uint8_t) 'C']=1; // C Cytosine 00434 maptable_to_bin[(uint8_t) 'G']=2; // G Guanine 00435 maptable_to_bin[(uint8_t) 'T']=3; // T Thymine 00436 maptable_to_bin[(uint8_t) 'U']=4; // U Uracil 00437 maptable_to_bin[(uint8_t) 'R']=5; // R Purine (A or G) 00438 maptable_to_bin[(uint8_t) 'Y']=6; // Y Pyrimidine (C, T, or U) 00439 maptable_to_bin[(uint8_t) 'M']=7; // M C or A 00440 maptable_to_bin[(uint8_t) 'K']=8; // K T, U, or G 00441 maptable_to_bin[(uint8_t) 'W']=9; // W T, U, or A 00442 maptable_to_bin[(uint8_t) 'S']=10; // S C or G 00443 maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A) 00444 maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C) 00445 maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G) 00446 maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U) 00447 maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U) 00448 00449 maptable_to_char[0]=(uint8_t) 'A'; // A Adenine 00450 maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine 00451 maptable_to_char[2]=(uint8_t) 'G'; // G Guanine 00452 maptable_to_char[3]=(uint8_t) 'T'; // T Thymine 00453 maptable_to_char[4]=(uint8_t) 'U'; // U Uracil 00454 maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G) 00455 maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U) 00456 maptable_to_char[7]=(uint8_t) 'M'; // M C or A 00457 maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G 00458 maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A 00459 maptable_to_char[10]=(uint8_t) 'S'; // S C or G 00460 maptable_to_char[11]=(uint8_t) 'B'; // B C, T, U, or G (not A) 00461 maptable_to_char[12]=(uint8_t) 'D'; // D A, T, U, or G (not C) 00462 maptable_to_char[13]=(uint8_t) 'H'; // H A, T, U, or C (not G) 00463 maptable_to_char[14]=(uint8_t) 'V'; // V A, C, or G (not T, not U) 00464 maptable_to_char[15]=(uint8_t) 'N'; // N Any base (A, C, G, T, or U) 00465 break; 00466 00467 case IUPAC_AMINO_ACID: 00468 valid_chars[(uint8_t) 'A']=true; //A Ala Alanine 00469 valid_chars[(uint8_t) 'R']=true; //R Arg Arginine 00470 valid_chars[(uint8_t) 'N']=true; //N Asn Asparagine 00471 valid_chars[(uint8_t) 'D']=true; //D Asp Aspartic acid 00472 valid_chars[(uint8_t) 'C']=true; //C Cys Cysteine 00473 valid_chars[(uint8_t) 'Q']=true; //Q Gln Glutamine 00474 valid_chars[(uint8_t) 'E']=true; //E Glu Glutamic acid 00475 valid_chars[(uint8_t) 'G']=true; //G Gly Glycine 00476 valid_chars[(uint8_t) 'H']=true; //H His Histidine 00477 valid_chars[(uint8_t) 'I']=true; //I Ile Isoleucine 00478 valid_chars[(uint8_t) 'L']=true; //L Leu Leucine 00479 valid_chars[(uint8_t) 'K']=true; //K Lys Lysine 00480 valid_chars[(uint8_t) 'M']=true; //M Met Methionine 00481 valid_chars[(uint8_t) 'F']=true; //F Phe Phenylalanine 00482 valid_chars[(uint8_t) 'P']=true; //P Pro Proline 00483 valid_chars[(uint8_t) 'S']=true; //S Ser Serine 00484 valid_chars[(uint8_t) 'T']=true; //T Thr Threonine 00485 valid_chars[(uint8_t) 'W']=true; //W Trp Tryptophan 00486 valid_chars[(uint8_t) 'Y']=true; //Y Tyr Tyrosine 00487 valid_chars[(uint8_t) 'V']=true; //V Val Valine 00488 valid_chars[(uint8_t) 'B']=true; //B Asx Aspartic acid or Asparagine 00489 valid_chars[(uint8_t) 'Z']=true; //Z Glx Glutamine or Glutamic acid 00490 valid_chars[(uint8_t) 'X']=true; //X Xaa Any amino acid 00491 00492 maptable_to_bin[(uint8_t) 'A']=0; //A Ala Alanine 00493 maptable_to_bin[(uint8_t) 'R']=1; //R Arg Arginine 00494 maptable_to_bin[(uint8_t) 'N']=2; //N Asn Asparagine 00495 maptable_to_bin[(uint8_t) 'D']=3; //D Asp Aspartic acid 00496 maptable_to_bin[(uint8_t) 'C']=4; //C Cys Cysteine 00497 maptable_to_bin[(uint8_t) 'Q']=5; //Q Gln Glutamine 00498 maptable_to_bin[(uint8_t) 'E']=6; //E Glu Glutamic acid 00499 maptable_to_bin[(uint8_t) 'G']=7; //G Gly Glycine 00500 maptable_to_bin[(uint8_t) 'H']=8; //H His Histidine 00501 maptable_to_bin[(uint8_t) 'I']=9; //I Ile Isoleucine 00502 maptable_to_bin[(uint8_t) 'L']=10; //L Leu Leucine 00503 maptable_to_bin[(uint8_t) 'K']=11; //K Lys Lysine 00504 maptable_to_bin[(uint8_t) 'M']=12; //M Met Methionine 00505 maptable_to_bin[(uint8_t) 'F']=13; //F Phe Phenylalanine 00506 maptable_to_bin[(uint8_t) 'P']=14; //P Pro Proline 00507 maptable_to_bin[(uint8_t) 'S']=15; //S Ser Serine 00508 maptable_to_bin[(uint8_t) 'T']=16; //T Thr Threonine 00509 maptable_to_bin[(uint8_t) 'W']=17; //W Trp Tryptophan 00510 maptable_to_bin[(uint8_t) 'Y']=18; //Y Tyr Tyrosine 00511 maptable_to_bin[(uint8_t) 'V']=19; //V Val Valine 00512 maptable_to_bin[(uint8_t) 'B']=20; //B Asx Aspartic acid or Asparagine 00513 maptable_to_bin[(uint8_t) 'Z']=21; //Z Glx Glutamine or Glutamic acid 00514 maptable_to_bin[(uint8_t) 'X']=22; //X Xaa Any amino acid 00515 00516 maptable_to_char[0]=(uint8_t) 'A'; //A Ala Alanine 00517 maptable_to_char[1]=(uint8_t) 'R'; //R Arg Arginine 00518 maptable_to_char[2]=(uint8_t) 'N'; //N Asn Asparagine 00519 maptable_to_char[3]=(uint8_t) 'D'; //D Asp Aspartic acid 00520 maptable_to_char[4]=(uint8_t) 'C'; //C Cys Cysteine 00521 maptable_to_char[5]=(uint8_t) 'Q'; //Q Gln Glutamine 00522 maptable_to_char[6]=(uint8_t) 'E'; //E Glu Glutamic acid 00523 maptable_to_char[7]=(uint8_t) 'G'; //G Gly Glycine 00524 maptable_to_char[8]=(uint8_t) 'H'; //H His Histidine 00525 maptable_to_char[9]=(uint8_t) 'I'; //I Ile Isoleucine 00526 maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine 00527 maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine 00528 maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine 00529 maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine 00530 maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline 00531 maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine 00532 maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine 00533 maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan 00534 maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine 00535 maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine 00536 maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine 00537 maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid 00538 maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid 00539 break; 00540 00541 default: 00542 break; //leave uninitialised 00543 }; 00544 } 00545 00546 void CAlphabet::clear_histogram() 00547 { 00548 memset(histogram, 0, sizeof(histogram)); 00549 print_histogram(); 00550 } 00551 00552 int32_t CAlphabet::get_max_value_in_histogram() 00553 { 00554 int32_t max_sym=-1; 00555 for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--) 00556 { 00557 if (histogram[i]) 00558 { 00559 max_sym=i; 00560 break; 00561 } 00562 } 00563 00564 return max_sym; 00565 } 00566 00567 int32_t CAlphabet::get_num_symbols_in_histogram() 00568 { 00569 int32_t num_sym=0; 00570 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00571 { 00572 if (histogram[i]) 00573 num_sym++; 00574 } 00575 00576 return num_sym; 00577 } 00578 00579 int32_t CAlphabet::get_num_bits_in_histogram() 00580 { 00581 int32_t num_sym=get_num_symbols_in_histogram(); 00582 if (num_sym>0) 00583 return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2)); 00584 else 00585 return 0; 00586 } 00587 00588 void CAlphabet::print_histogram() 00589 { 00590 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00591 { 00592 if (histogram[i]) 00593 SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]); 00594 } 00595 } 00596 00597 bool CAlphabet::check_alphabet(bool print_error) 00598 { 00599 bool result = true; 00600 00601 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00602 { 00603 if (histogram[i]>0 && valid_chars[i]==0) 00604 { 00605 result=false; 00606 break; 00607 } 00608 } 00609 00610 if (!result && print_error) 00611 { 00612 print_histogram(); 00613 SG_ERROR( "ALPHABET does not contain all symbols in histogram\n"); 00614 } 00615 00616 return result; 00617 } 00618 00619 bool CAlphabet::check_alphabet_size(bool print_error) 00620 { 00621 if (get_num_bits_in_histogram() > get_num_bits()) 00622 { 00623 if (print_error) 00624 { 00625 print_histogram(); 00626 fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ; 00627 SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n"); 00628 } 00629 return false; 00630 } 00631 else 00632 return true; 00633 00634 } 00635 00636 void CAlphabet::copy_histogram(CAlphabet* a) 00637 { 00638 memcpy(histogram, a->get_histogram(), sizeof(histogram)); 00639 } 00640 00641 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet) 00642 { 00643 00644 int32_t idx; 00645 switch (alphabet) 00646 { 00647 case DNA: 00648 idx=0; 00649 break; 00650 case RAWDNA: 00651 idx=1; 00652 break; 00653 case RNA: 00654 idx=2; 00655 break; 00656 case PROTEIN: 00657 idx=3; 00658 break; 00659 case BINARY: 00660 idx=4; 00661 break; 00662 case ALPHANUM: 00663 idx=5; 00664 break; 00665 case CUBE: 00666 idx=6; 00667 break; 00668 case RAWBYTE: 00669 idx=7; 00670 break; 00671 case IUPAC_NUCLEIC_ACID: 00672 idx=8; 00673 break; 00674 case IUPAC_AMINO_ACID: 00675 idx=9; 00676 break; 00677 case NONE: 00678 idx=10; 00679 break; 00680 case DIGIT: 00681 idx=11; 00682 break; 00683 case DIGIT2: 00684 idx=12; 00685 break; 00686 default: 00687 idx=13; 00688 break; 00689 } 00690 return alphabet_names[idx]; 00691 } 00692