|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Soeren Sonnenburg 00008 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #ifndef _CALPHABET__H__ 00012 #define _CALPHABET__H__ 00013 00014 #include "base/SGObject.h" 00015 #include "lib/Mathematics.h" 00016 #include "lib/common.h" 00017 00018 namespace shogun 00019 { 00021 enum EAlphabet 00022 { 00024 DNA=0, 00025 00027 RAWDNA=1, 00028 00030 RNA=2, 00031 00033 PROTEIN=3, 00034 00035 // BINARY just 0 and 1 00036 BINARY=4, 00037 00039 ALPHANUM=5, 00040 00042 CUBE=6, 00043 00045 RAWBYTE=7, 00046 00048 IUPAC_NUCLEIC_ACID=8, 00049 00051 IUPAC_AMINO_ACID=9, 00052 00054 NONE=10, 00055 00057 DIGIT=11, 00058 00060 DIGIT2=12, 00061 00063 RAWDIGIT=13, 00064 00066 RAWDIGIT2=14, 00067 00069 UNKNOWN=15, 00070 00072 SNP=16, 00073 00075 RAWSNP=17 00076 }; 00077 00078 00089 class CAlphabet : public CSGObject 00090 { 00091 public: 00092 00096 //CAlphabet(); 00097 00098 00104 CAlphabet(char* alpha, int32_t len); 00105 00110 CAlphabet(EAlphabet alpha); 00111 00116 CAlphabet(CAlphabet* alpha); 00117 virtual ~CAlphabet(); 00118 00123 bool set_alphabet(EAlphabet alpha); 00124 00129 inline EAlphabet get_alphabet() const 00130 { 00131 return alphabet; 00132 } 00133 00138 inline int32_t get_num_symbols() const 00139 { 00140 return num_symbols; 00141 } 00142 00148 inline int32_t get_num_bits() const 00149 { 00150 return num_bits; 00151 } 00152 00158 inline uint8_t remap_to_bin(uint8_t c) 00159 { 00160 return maptable_to_bin[c]; 00161 } 00162 00168 inline uint8_t remap_to_char(uint8_t c) 00169 { 00170 return maptable_to_char[c]; 00171 } 00172 00174 void clear_histogram(); 00175 00181 template <class T> 00182 void add_string_to_histogram(T* p, int64_t len) 00183 { 00184 for (int64_t i=0; i<len; i++) 00185 add_byte_to_histogram((uint8_t) (p[i])); 00186 } 00187 00192 inline void add_byte_to_histogram(uint8_t p) 00193 { 00194 histogram[p]++; 00195 } 00196 00198 void print_histogram(); 00199 00205 inline void get_hist(int64_t** h, int32_t* len) 00206 { 00207 int32_t hist_size=(1 << (sizeof(uint8_t)*8)); 00208 ASSERT(h && len); 00209 *h=(int64_t*) malloc(sizeof(int64_t)*hist_size); 00210 ASSERT(*h); 00211 *len=hist_size; 00212 ASSERT(*len); 00213 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size); 00214 } 00215 00217 inline const int64_t* get_histogram() 00218 { 00219 return &histogram[0]; 00220 } 00221 00228 bool check_alphabet(bool print_error=true); 00229 00236 inline bool is_valid(uint8_t c) 00237 { 00238 return valid_chars[c]; 00239 } 00240 00246 bool check_alphabet_size(bool print_error=true); 00247 00252 int32_t get_num_symbols_in_histogram(); 00253 00258 int32_t get_max_value_in_histogram(); 00259 00266 int32_t get_num_bits_in_histogram(); 00267 00272 static const char* get_alphabet_name(EAlphabet alphabet); 00273 00274 00276 inline virtual const char* get_name() const { return "Alphabet"; } 00277 00286 template <class ST> 00287 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) 00288 { 00289 int32_t i,j; 00290 ST value=0; 00291 00292 for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T 00293 { 00294 value=0; 00295 for (j=i; j>=i-p_order+1; j--) 00296 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1))); 00297 00298 obs[i]= (ST) value; 00299 } 00300 00301 for (i=p_order-2;i>=0;i--) 00302 { 00303 if (i>=sequence_length) 00304 continue; 00305 00306 value=0; 00307 for (j=i; j>=i-p_order+1; j--) 00308 { 00309 value= (value >> max_val); 00310 if (j>=0 && j<sequence_length) 00311 value|=obs[j] << (max_val * (p_order-1)); 00312 } 00313 obs[i]=value; 00314 } 00315 00316 // TODO we should get rid of this loop! 00317 if (start>0) 00318 { 00319 for (i=start; i<sequence_length; i++) 00320 obs[i-start]=obs[i]; 00321 } 00322 } 00323 00332 template <class ST> 00333 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) 00334 { 00335 int32_t i,j; 00336 ST value=0; 00337 00338 for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T 00339 { 00340 value=0; 00341 for (j=i; j>=i-p_order+1; j--) 00342 value= (value << max_val) | obs[j]; 00343 00344 obs[i]= (ST) value; 00345 } 00346 00347 for (i=p_order-2;i>=0;i--) 00348 { 00349 if (i>=sequence_length) 00350 continue; 00351 00352 value=0; 00353 for (j=i; j>=i-p_order+1; j--) 00354 { 00355 value= (value << max_val); 00356 if (j>=0 && j<sequence_length) 00357 value|=obs[j]; 00358 } 00359 obs[i]=value; 00360 } 00361 00362 // TODO we should get rid of this loop! 00363 if (start>0) 00364 { 00365 for (i=start; i<sequence_length; i++) 00366 obs[i-start]=obs[i]; 00367 } 00368 } 00369 00379 template <class ST> 00380 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00381 { 00382 ASSERT(gap>=0); 00383 00384 const int32_t start_gap=(p_order-gap)/2; 00385 const int32_t end_gap=start_gap+gap; 00386 00387 int32_t i,j; 00388 ST value=0; 00389 00390 // almost all positions 00391 for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T 00392 { 00393 value=0; 00394 for (j=i; j>=i-p_order+1; j--) 00395 { 00396 if (i-j<start_gap) 00397 { 00398 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap))); 00399 } 00400 else if (i-j>=end_gap) 00401 { 00402 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap))); 00403 } 00404 } 00405 obs[i]= (ST) value; 00406 } 00407 00408 // the remaining `order` positions 00409 for (i=p_order-2;i>=0;i--) 00410 { 00411 if (i>=sequence_length) 00412 continue; 00413 00414 value=0; 00415 for (j=i; j>=i-p_order+1; j--) 00416 { 00417 if (i-j<start_gap) 00418 { 00419 value= (value >> max_val); 00420 if (j>=0 && j<sequence_length) 00421 value|=obs[j] << (max_val * (p_order-1-gap)); 00422 } 00423 else if (i-j>=end_gap) 00424 { 00425 value= (value >> max_val); 00426 if (j>=0 && j<sequence_length) 00427 value|=obs[j] << (max_val * (p_order-1-gap)); 00428 } 00429 } 00430 obs[i]=value; 00431 } 00432 00433 // TODO we should get rid of this loop! 00434 if (start>0) 00435 { 00436 for (i=start; i<sequence_length; i++) 00437 obs[i-start]=obs[i]; 00438 } 00439 } 00440 00450 template <class ST> 00451 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00452 { 00453 ASSERT(gap>=0); 00454 00455 const int32_t start_gap=(p_order-gap)/2; 00456 const int32_t end_gap=start_gap+gap; 00457 00458 int32_t i,j; 00459 ST value=0; 00460 00461 // almost all positions 00462 for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T 00463 { 00464 value=0; 00465 for (j=i; j>=i-p_order+1; j--) 00466 { 00467 if (i-j<start_gap) 00468 value= (value << max_val) | obs[j]; 00469 else if (i-j>=end_gap) 00470 value= (value << max_val) | obs[j]; 00471 } 00472 obs[i]= (ST) value; 00473 } 00474 00475 // the remaining `order` positions 00476 for (i=p_order-2;i>=0;i--) 00477 { 00478 if (i>=sequence_length) 00479 continue; 00480 00481 value=0; 00482 for (j=i; j>=i-p_order+1; j--) 00483 { 00484 if (i-j<start_gap) 00485 { 00486 value= value << max_val; 00487 if (j>=0 && j<sequence_length) 00488 value|=obs[j]; 00489 } 00490 else if (i-j>=end_gap) 00491 { 00492 value= value << max_val; 00493 if (j>=0 && j<sequence_length) 00494 value|=obs[j]; 00495 } 00496 } 00497 obs[i]=value; 00498 } 00499 00500 // TODO we should get rid of this loop! 00501 if (start>0) 00502 { 00503 for (i=start; i<sequence_length; i++) 00504 obs[i-start]=obs[i]; 00505 } 00506 } 00507 00508 00509 protected: 00511 void init_map_table(); 00512 00517 void copy_histogram(CAlphabet* src); 00518 00519 00520 00521 #ifdef HAVE_BOOST_SERIALIZATION 00522 protected: 00523 00524 friend class ::boost::serialization::access; 00525 00526 template<class Archive> 00527 00528 void serialize(Archive & ar, const unsigned int archive_version) 00529 { 00530 00531 SG_DEBUG("archiving CAlphabet\n"); 00532 00533 ar & ::boost::serialization::base_object<CSGObject>(*this); 00534 00535 ar & num_bits; 00536 00537 SG_DEBUG("done with CAlphabet\n"); 00538 00539 } 00540 00541 00542 #endif //HAVE_BOOST_SERIALIZATION 00543 00544 00545 public: 00547 static const uint8_t B_A; 00549 static const uint8_t B_C; 00551 static const uint8_t B_G; 00553 static const uint8_t B_T; 00555 static const uint8_t B_0; 00557 static const uint8_t MAPTABLE_UNDEF; 00559 static const char* alphabet_names[18]; 00560 00561 protected: 00563 EAlphabet alphabet; 00565 int32_t num_symbols; 00567 int32_t num_bits; 00569 bool valid_chars[1 << (sizeof(uint8_t)*8)]; 00571 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)]; 00573 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)]; 00575 int64_t histogram[1 << (sizeof(uint8_t)*8)]; 00576 }; 00577 00578 00579 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00580 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00581 { 00582 } 00583 00584 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00585 { 00586 } 00587 00588 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00589 { 00590 } 00591 00592 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00593 { 00594 } 00595 00596 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00597 { 00598 } 00599 00600 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00601 { 00602 } 00603 #endif 00604 00605 } 00606 00607 #ifdef HAVE_BOOST_SERIALIZATION 00608 00609 namespace boost { 00610 namespace serialization { 00611 template<class Archive> 00612 inline void save_construct_data(Archive & ar, shogun::CAlphabet* t, const unsigned int archive_version) 00613 { 00614 std::cout << "archiving construction data CAlphabet" << std::endl; 00615 ar << t->alphabet; 00616 std::cout << "done archiving construction data CAlphabet" << std::endl; 00617 } 00618 00619 template<class Archive> 00620 inline void load_construct_data(Archive & ar, shogun::CAlphabet * t, const unsigned int archive_version) 00621 { 00622 //SG_DEBUG("archiving construction data CAlphabet\n"); 00623 shogun::EAlphabet a; 00624 ar >> a; 00625 new(t)shogun::CAlphabet(a); 00626 //SG_DEBUG("done construction data archiving CAlphabet\n"); 00627 } 00628 } // serialization 00629 } // namespace boost 00630 #endif //HAVE_BOOST_SERIALIZATION 00631 00632 00633 00634 #endif