|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #ifndef _CSTRINGFEATURES__H__ 00013 #define _CSTRINGFEATURES__H__ 00014 00015 #include "lib/common.h" 00016 #include "lib/io.h" 00017 #include "lib/Cache.h" 00018 #include "lib/DynamicArray.h" 00019 #include "lib/File.h" 00020 #include "lib/MemoryMappedFile.h" 00021 #include "lib/Mathematics.h" 00022 #include "lib/Compressor.h" 00023 00024 #include "preproc/PreProc.h" 00025 #include "preproc/StringPreProc.h" 00026 #include "features/Features.h" 00027 #include "features/Alphabet.h" 00028 00029 #include <sys/types.h> 00030 #include <sys/stat.h> 00031 #include <dirent.h> 00032 #include <stdio.h> 00033 #include <stdlib.h> 00034 #include <unistd.h> 00035 00036 namespace shogun 00037 { 00038 class CCompressor; 00039 enum E_COMPRESSION_TYPE; 00040 class CAlphabet; 00041 enum EAlphabet; 00042 template <class T> class CDynamicArray; 00043 class CFile; 00044 template <class T> class CMemoryMappedFile; 00045 class CMath; 00046 template <class ST> class CStringPreProc; 00047 template <class T> class T_STRING; 00048 00049 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00050 00051 template <class T> class T_STRING 00052 { 00053 00054 00055 #ifdef HAVE_BOOST_SERIALIZATION 00056 00057 private: 00058 00059 00060 friend class ::boost::serialization::access; 00061 template<class Archive> 00062 void save(Archive & ar, const unsigned int archive_version) const 00063 { 00064 00065 //SG_DEBUG("archiving T_STRING\n"); 00066 00067 ar & length; 00068 00069 for (int i=0; i < length; ++i) { 00070 ar & string[i]; 00071 } 00072 00073 //SG_DEBUG("done archiving T_STRING\n"); 00074 00075 } 00076 00077 template<class Archive> 00078 void load(Archive & ar, const unsigned int archive_version) 00079 { 00080 00081 //SG_DEBUG("archiving T_STRING\n"); 00082 00083 ar & length; 00084 00085 string = new T[length]; 00086 00087 for (int i=0; i < length; ++i) { 00088 ar & string[i]; 00089 } 00090 00091 //SG_DEBUG("done archiving T_STRING\n"); 00092 00093 } 00094 00095 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER(); 00096 00097 00098 #endif //HAVE_BOOST_SERIALIZATION 00099 00100 public: 00102 T* string; 00104 int32_t length; 00105 }; 00106 #endif // DOXYGEN_SHOULD_SKIP_THIS 00107 00108 00127 template <class ST> class CStringFeatures : public CFeatures 00128 { 00129 public: 00133 CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0), 00134 features(NULL), single_string(NULL),length_of_single_string(0), 00135 max_string_length(0), order(0), symbol_mask_table(NULL), 00136 preprocess_on_get(false), feature_cache(NULL) 00137 { 00138 } 00139 00144 CStringFeatures(EAlphabet alpha) 00145 : CFeatures(0), num_vectors(0), features(NULL), 00146 single_string(NULL),length_of_single_string(0), 00147 max_string_length(0), order(0), symbol_mask_table(NULL), 00148 preprocess_on_get(false), feature_cache(NULL) 00149 { 00150 alphabet=new CAlphabet(alpha); 00151 SG_REF(alphabet); 00152 num_symbols=alphabet->get_num_symbols(); 00153 original_num_symbols=num_symbols; 00154 } 00155 00163 CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors, 00164 int32_t p_max_string_length, EAlphabet alpha) 00165 : CFeatures(0), num_vectors(0), features(NULL), 00166 single_string(NULL),length_of_single_string(0), 00167 max_string_length(0), order(0), symbol_mask_table(NULL), 00168 preprocess_on_get(false), feature_cache(NULL) 00169 { 00170 alphabet=new CAlphabet(alpha); 00171 SG_REF(alphabet); 00172 num_symbols=alphabet->get_num_symbols(); 00173 original_num_symbols=num_symbols; 00174 set_features(p_features, p_num_vectors, p_max_string_length); 00175 } 00176 00184 CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors, 00185 int32_t p_max_string_length, CAlphabet* alpha) 00186 : CFeatures(0), num_vectors(0), features(NULL), 00187 single_string(NULL),length_of_single_string(0), 00188 max_string_length(0), order(0), symbol_mask_table(NULL), 00189 preprocess_on_get(false), feature_cache(NULL) 00190 { 00191 alphabet=new CAlphabet(alpha); 00192 SG_REF(alphabet); 00193 num_symbols=alphabet->get_num_symbols(); 00194 original_num_symbols=num_symbols; 00195 set_features(p_features, p_num_vectors, p_max_string_length); 00196 } 00197 00202 CStringFeatures(CAlphabet* alpha) 00203 : CFeatures(0), num_vectors(0), features(NULL), 00204 single_string(NULL),length_of_single_string(0), 00205 max_string_length(0), order(0), symbol_mask_table(NULL), 00206 preprocess_on_get(false), feature_cache(NULL) 00207 { 00208 ASSERT(alpha); 00209 SG_REF(alpha); 00210 alphabet=alpha; 00211 num_symbols=alphabet->get_num_symbols(); 00212 original_num_symbols=num_symbols; 00213 } 00214 00216 CStringFeatures(const CStringFeatures & orig) 00217 : CFeatures(orig), num_vectors(orig.num_vectors), 00218 single_string(orig.single_string), 00219 length_of_single_string(orig.length_of_single_string), 00220 max_string_length(orig.max_string_length), 00221 num_symbols(orig.num_symbols), 00222 original_num_symbols(orig.original_num_symbols), 00223 order(orig.order), preprocess_on_get(false), 00224 feature_cache(NULL) 00225 { 00226 ASSERT(orig.single_string == NULL); //not implemented 00227 00228 alphabet=orig.alphabet; 00229 SG_REF(alphabet); 00230 00231 if (orig.features) 00232 { 00233 features=new T_STRING<ST>[orig.num_vectors]; 00234 00235 for (int32_t i=0; i<num_vectors; i++) 00236 { 00237 features[i].string=new ST[orig.features[i].length]; 00238 features[i].length=orig.features[i].length; 00239 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length); 00240 } 00241 } 00242 00243 if (orig.symbol_mask_table) 00244 { 00245 symbol_mask_table=new ST[256]; 00246 for (int32_t i=0; i<256; i++) 00247 symbol_mask_table[i]=orig.symbol_mask_table[i]; 00248 } 00249 } 00250 00256 CStringFeatures(CFile* loader, EAlphabet alpha=DNA) 00257 : CFeatures(loader), num_vectors(0), features(NULL), single_string(NULL), 00258 length_of_single_string(0), max_string_length(0), order(0), 00259 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL) 00260 { 00261 alphabet=new CAlphabet(alpha); 00262 SG_REF(alphabet); 00263 num_symbols=alphabet->get_num_symbols(); 00264 original_num_symbols=num_symbols; 00265 load(loader); 00266 } 00267 00268 virtual ~CStringFeatures() 00269 { 00270 cleanup(); 00271 00272 SG_UNREF(alphabet); 00273 } 00274 00276 virtual void cleanup() 00277 { 00278 if (single_string) 00279 { 00280 delete[] single_string; 00281 single_string=NULL; 00282 } 00283 else 00284 { 00285 for (int32_t i=0; i<num_vectors; i++) 00286 cleanup_feature_vector(i); 00287 } 00288 00289 num_vectors=0; 00290 delete[] features; 00291 delete[] symbol_mask_table; 00292 features=NULL; 00293 symbol_mask_table=NULL; 00294 00295 /* start with a fresh alphabet, but instead of emptying the histogram 00296 * create a new object (to leave the alphabet object alone if it is used 00297 * by others) 00298 */ 00299 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00300 SG_UNREF(alphabet); 00301 alphabet=alpha; 00302 SG_REF(alphabet); 00303 } 00304 00306 virtual void cleanup_feature_vector(int32_t num) 00307 { 00308 ASSERT(num<num_vectors); 00309 if (features) 00310 { 00311 delete[] features[num].string; 00312 features[num].string=NULL; 00313 features[num].length=0; 00314 } 00315 } 00316 00321 inline virtual EFeatureClass get_feature_class() { return C_STRING; } 00322 00327 inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; } 00328 00333 inline CAlphabet* get_alphabet() 00334 { 00335 SG_REF(alphabet); 00336 return alphabet; 00337 } 00338 00343 virtual CFeatures* duplicate() const 00344 { 00345 return new CStringFeatures<ST>(*this); 00346 } 00347 00354 void get_feature_vector(ST** dst, int32_t* len, int32_t num) 00355 { 00356 ASSERT(features); 00357 if (num>=num_vectors) 00358 { 00359 SG_ERROR("Index out of bounds (number of strings %d, you " 00360 "requested %d)\n", num_vectors, num); 00361 } 00362 00363 int32_t l; 00364 bool free_vec; 00365 ST* vec=get_feature_vector(num, l, free_vec); 00366 *len=l; 00367 *dst=(ST*) malloc(*len * sizeof(ST)); 00368 ASSERT(*dst); 00369 memcpy(*dst, vec, *len * sizeof(ST)); 00370 free_feature_vector(vec, num, free_vec); 00371 } 00372 00379 void set_feature_vector(ST* src, int32_t len, int32_t num) 00380 { 00381 ASSERT(features); 00382 if (num>=num_vectors) 00383 { 00384 SG_ERROR("Index out of bounds (number of strings %d, you " 00385 "requested %d)\n", num_vectors, num); 00386 } 00387 00388 if (len<=0) 00389 SG_ERROR("String has zero or negative length\n"); 00390 00391 00392 cleanup_feature_vector(num); 00393 features[num].length=len; 00394 features[num].string=new ST[len]; 00395 memcpy(features[num].string, src, len*sizeof(ST)); 00396 00397 determine_maximum_string_length(); 00398 } 00399 00402 void enable_on_the_fly_preprocessing() 00403 { 00404 preprocess_on_get=true; 00405 } 00406 00410 void disable_on_the_fly_preprocessing() 00411 { 00412 preprocess_on_get=false; 00413 } 00414 00423 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree) 00424 { 00425 ASSERT(features); 00426 ASSERT(num<num_vectors); 00427 00428 if (!preprocess_on_get) 00429 { 00430 dofree=false; 00431 len=features[num].length; 00432 return features[num].string; 00433 } 00434 else 00435 { 00436 SG_DEBUG( "computing feature vector!\n") ; 00437 ST* feat=compute_feature_vector(num, len); 00438 dofree=true; 00439 00440 if (get_num_preproc()) 00441 { 00442 ST* tmp_feat_before = feat; 00443 00444 for (int32_t i=0; i<get_num_preproc(); i++) 00445 { 00446 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i); 00447 feat=p->apply_to_string(tmp_feat_before, len); 00448 SG_UNREF(p); 00449 delete[] tmp_feat_before; 00450 tmp_feat_before=feat; 00451 } 00452 } 00453 // TODO: implement caching 00454 return feat; 00455 } 00456 } 00457 00462 CStringFeatures<ST>* get_transposed() 00463 { 00464 int32_t num_feat; 00465 int32_t num_vec; 00466 T_STRING<ST>* s=get_transposed(num_feat, num_vec); 00467 00468 return new CStringFeatures<ST>(s, num_vec, num_feat, alphabet); 00469 } 00470 00482 T_STRING<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec) 00483 { 00484 num_feat=num_vectors; 00485 num_vec=get_max_vector_length(); 00486 ASSERT(have_same_length()); 00487 00488 SG_DEBUG("Allocating memory for transposed string features of size %ld\n", 00489 int64_t(num_feat)*num_vec); 00490 00491 T_STRING<ST>* sf=new T_STRING<ST>[num_vec]; 00492 00493 for (int32_t i=0; i<num_vec; i++) 00494 { 00495 sf[i].string=new ST[num_feat]; 00496 sf[i].length=num_feat; 00497 } 00498 00499 for (int32_t i=0; i<num_feat; i++) 00500 { 00501 int32_t len=0; 00502 bool free_vec=false; 00503 ST* vec=get_feature_vector(i, len, free_vec); 00504 00505 for (int32_t j=0; j<num_vec; j++) 00506 sf[j].string[i]=vec[j]; 00507 00508 free_feature_vector(vec, i, free_vec); 00509 } 00510 return sf; 00511 } 00512 00519 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree) 00520 { 00521 if (feature_cache) 00522 feature_cache->unlock_entry(num); 00523 00524 if (dofree) 00525 delete[] feat_vec ; 00526 } 00527 00534 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num) 00535 { 00536 int32_t len; 00537 bool free_vec; 00538 ST* vec=get_feature_vector(vec_num, len, free_vec); 00539 ASSERT(feat_num<len); 00540 ST result=vec[feat_num]; 00541 free_feature_vector(vec, vec_num, free_vec); 00542 00543 return result; 00544 } 00545 00551 virtual inline int32_t get_vector_length(int32_t vec_num) 00552 { 00553 int32_t len; 00554 bool free_vec; 00555 ST* vec=get_feature_vector(vec_num, len, free_vec); 00556 free_feature_vector(vec, vec_num, free_vec); 00557 return len; 00558 } 00559 00564 virtual inline int32_t get_max_vector_length() 00565 { 00566 return max_string_length; 00567 } 00568 00573 virtual inline int32_t get_num_vectors() { return num_vectors; } 00574 00581 inline floatmax_t get_num_symbols() { return num_symbols; } 00582 00590 inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); } 00591 00592 // these functions are necessary to find out about a former conversion process 00593 00598 inline floatmax_t get_original_num_symbols() { return original_num_symbols; } 00599 00604 inline int32_t get_order() { return order; } 00605 00613 inline ST get_masked_symbols(ST symbol, uint8_t mask) 00614 { 00615 ASSERT(symbol_mask_table); 00616 return symbol_mask_table[mask] & symbol; 00617 } 00618 00625 inline ST shift_offset(ST offset, int32_t amount) 00626 { 00627 ASSERT(alphabet); 00628 return (offset << (amount*alphabet->get_num_bits())); 00629 } 00630 00637 inline ST shift_symbol(ST symbol, int32_t amount) 00638 { 00639 ASSERT(alphabet); 00640 return (symbol >> (amount*alphabet->get_num_bits())); 00641 } 00642 00647 virtual inline void load(CFile* loader); 00648 00657 void load_ascii_file(char* fname, bool remap_to_bin=true, 00658 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA) 00659 { 00660 size_t blocksize=1024*1024; 00661 size_t required_blocksize=0; 00662 uint8_t* dummy=new uint8_t[blocksize]; 00663 uint8_t* overflow=NULL; 00664 int32_t overflow_len=0; 00665 00666 cleanup(); 00667 00668 CAlphabet* alpha=new CAlphabet(ascii_alphabet); 00669 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet); 00670 00671 FILE* f=fopen(fname, "ro"); 00672 00673 if (f) 00674 { 00675 num_vectors=0; 00676 max_string_length=0; 00677 00678 SG_INFO("counting line numbers in file %s\n", fname); 00679 size_t block_offs=0; 00680 size_t old_block_offs=0; 00681 fseek(f, 0, SEEK_END); 00682 size_t fsize=ftell(f); 00683 rewind(f); 00684 00685 if (blocksize>fsize) 00686 blocksize=fsize; 00687 00688 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize); 00689 00690 size_t sz=blocksize; 00691 while (sz == blocksize) 00692 { 00693 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00694 bool contains_cr=false; 00695 for (size_t i=0; i<sz; i++) 00696 { 00697 block_offs++; 00698 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00699 { 00700 num_vectors++; 00701 contains_cr=true; 00702 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00703 old_block_offs=block_offs; 00704 } 00705 } 00706 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00707 } 00708 00709 SG_INFO("found %d strings\n", num_vectors); 00710 delete[] dummy; 00711 blocksize=required_blocksize; 00712 dummy = new uint8_t[blocksize]; 00713 overflow = new uint8_t[blocksize]; 00714 features=new T_STRING<ST>[num_vectors]; 00715 00716 rewind(f); 00717 sz=blocksize; 00718 int32_t lines=0; 00719 while (sz == blocksize) 00720 { 00721 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00722 00723 size_t old_sz=0; 00724 for (size_t i=0; i<sz; i++) 00725 { 00726 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00727 { 00728 int32_t len=i-old_sz; 00729 //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz); 00730 max_string_length=CMath::max(max_string_length, len+overflow_len); 00731 00732 features[lines].length=len; 00733 features[lines].string=new ST[len]; 00734 00735 if (remap_to_bin) 00736 { 00737 for (int32_t j=0; j<overflow_len; j++) 00738 features[lines].string[j]=alpha->remap_to_bin(overflow[j]); 00739 for (int32_t j=0; j<len; j++) 00740 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]); 00741 alpha->add_string_to_histogram(&dummy[old_sz], len); 00742 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length); 00743 } 00744 else 00745 { 00746 for (int32_t j=0; j<overflow_len; j++) 00747 features[lines].string[j]=overflow[j]; 00748 for (int32_t j=0; j<len; j++) 00749 features[lines].string[j+overflow_len]=dummy[old_sz+j]; 00750 alpha->add_string_to_histogram(&dummy[old_sz], len); 00751 alpha->add_string_to_histogram(features[lines].string, features[lines].length); 00752 } 00753 00754 // clear overflow 00755 overflow_len=0; 00756 00757 //CMath::display_vector(features[lines].string, len); 00758 old_sz=i+1; 00759 lines++; 00760 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 00761 } 00762 } 00763 for (size_t i=old_sz; i<sz; i++) 00764 overflow[i-old_sz]=dummy[i]; 00765 00766 overflow_len=sz-old_sz; 00767 } 00768 00769 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00770 { 00771 SG_INFO("file successfully read\n"); 00772 SG_INFO("max_string_length=%d\n", max_string_length); 00773 SG_INFO("num_strings=%d\n", num_vectors); 00774 } 00775 fclose(f); 00776 } 00777 00778 delete[] dummy; 00779 00780 SG_UNREF(alphabet); 00781 00782 if (remap_to_bin) 00783 alphabet = alpha_bin; 00784 else 00785 alphabet = alpha; 00786 SG_REF(alphabet); 00787 num_symbols=alphabet->get_num_symbols(); 00788 } 00789 00796 bool load_fasta_file(const char* fname, bool ignore_invalid=false) 00797 { 00798 int32_t i=0; 00799 uint64_t len=0; 00800 uint64_t offs=0; 00801 int32_t num=0; 00802 int32_t max_len=0; 00803 00804 CMemoryMappedFile<char> f(fname); 00805 00806 while (true) 00807 { 00808 char* s=f.get_line(len, offs); 00809 if (!s) 00810 break; 00811 00812 if (len>0 && s[0]=='>') 00813 num++; 00814 } 00815 00816 if (num==0) 00817 SG_ERROR("No fasta hunks (lines starting with '>') found\n"); 00818 00819 cleanup(); 00820 SG_UNREF(alphabet); 00821 alphabet=new CAlphabet(DNA); 00822 num_symbols=alphabet->get_num_symbols(); 00823 00824 T_STRING<ST>* strings=new T_STRING<ST>[num]; 00825 offs=0; 00826 00827 for (i=0;i<num; i++) 00828 { 00829 uint64_t id_len=0; 00830 char* id=f.get_line(id_len, offs); 00831 00832 char* fasta=f.get_line(len, offs); 00833 char* s=fasta; 00834 int32_t fasta_len=0; 00835 int32_t spanned_lines=0; 00836 00837 while (true) 00838 { 00839 if (!s || len==0) 00840 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len); 00841 00842 if (s[0]=='>' || offs==f.get_size()) 00843 { 00844 offs-=len+1; // seek to beginning 00845 if (offs==f.get_size()) 00846 { 00847 SG_DEBUG("at EOF\n"); 00848 fasta_len+=len; 00849 } 00850 00851 len = fasta_len-spanned_lines; 00852 strings[i].string=new ST[len]; 00853 strings[i].length=len; 00854 00855 ST* str=strings[i].string; 00856 int32_t idx=0; 00857 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines); 00858 00859 for (int32_t j=0; j<fasta_len; j++) 00860 { 00861 if (fasta[j]=='\n') 00862 continue; 00863 00864 ST c = (ST) fasta[j]; 00865 00866 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j])) 00867 c = (ST) 'A'; 00868 00869 if (idx>=len) 00870 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str); 00871 str[idx++]=c; 00872 } 00873 max_len=CMath::max(max_len, strings[i].length); 00874 00875 00876 break; 00877 } 00878 00879 spanned_lines++; 00880 fasta_len+=len+1; // including '\n' 00881 s=f.get_line(len, offs); 00882 } 00883 } 00884 00885 return set_features(strings, num, max_len); 00886 } 00887 00895 bool load_fastq_file(const char* fname, 00896 bool ignore_invalid=false, bool bitremap_in_single_string=false) 00897 { 00898 CMemoryMappedFile<char> f(fname); 00899 00900 int32_t i=0; 00901 uint64_t len=0; 00902 uint64_t offs=0; 00903 00904 int32_t num=f.get_num_lines(); 00905 int32_t max_len=0; 00906 00907 if (num%4) 00908 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n"); 00909 num/=4; 00910 00911 cleanup(); 00912 SG_UNREF(alphabet); 00913 alphabet=new CAlphabet(DNA); 00914 00915 T_STRING<ST>* strings; 00916 00917 ST* str; 00918 if (bitremap_in_single_string) 00919 { 00920 strings=new T_STRING<ST>[1]; 00921 strings[0].string=new ST[num]; 00922 strings[0].length=num; 00923 f.get_line(len, offs); 00924 f.get_line(len, offs); 00925 order=len; 00926 max_len=num; 00927 offs=0; 00928 original_num_symbols=alphabet->get_num_symbols(); 00929 int32_t max_val=alphabet->get_num_bits(); 00930 str=new ST[len]; 00931 } 00932 else 00933 strings=new T_STRING<ST>[num]; 00934 00935 for (i=0;i<num; i++) 00936 { 00937 if (!f.get_line(len, offs)) 00938 SG_ERROR("Error reading 'read' identifier in line %d", 4*i); 00939 00940 char* s=f.get_line(len, offs); 00941 if (!s || len==0) 00942 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len); 00943 00944 if (bitremap_in_single_string) 00945 { 00946 if (len!=order) 00947 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len); 00948 for (int32_t j=0; j<order; j++) 00949 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]); 00950 00951 strings[0].string[i]=embed_word(str, order); 00952 } 00953 else 00954 { 00955 strings[i].string=new ST[len]; 00956 strings[i].length=len; 00957 str=strings[i].string; 00958 00959 if (ignore_invalid) 00960 { 00961 for (int32_t j=0; j<len; j++) 00962 { 00963 if (alphabet->is_valid((uint8_t) s[j])) 00964 str[j]= (ST) s[j]; 00965 else 00966 str[j]= (ST) 'A'; 00967 } 00968 } 00969 else 00970 { 00971 for (int32_t j=0; j<len; j++) 00972 str[j]= (ST) s[j]; 00973 } 00974 max_len=CMath::max(max_len, (int32_t) len); 00975 } 00976 00977 00978 if (!f.get_line(len, offs)) 00979 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2); 00980 00981 if (!f.get_line(len, offs)) 00982 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3); 00983 } 00984 00985 if (bitremap_in_single_string) 00986 num=1; 00987 00988 num_vectors=num; 00989 max_string_length=max_len; 00990 features=strings; 00991 00992 return true; 00993 } 00994 01000 bool load_from_directory(char* dirname) 01001 { 01002 struct dirent **namelist; 01003 int32_t n; 01004 01005 CIO::set_dirname(dirname); 01006 01007 SG_DEBUG("dirname '%s'\n", dirname); 01008 01009 n = scandir(dirname, &namelist, &CIO::filter, alphasort); 01010 if (n <= 0) 01011 { 01012 SG_ERROR("error calling scandir - no files found\n"); 01013 return false; 01014 } 01015 else 01016 { 01017 T_STRING<ST>* strings=NULL; 01018 01019 int32_t num=0; 01020 int32_t max_len=-1; 01021 01022 //usually n==num_vec, but it might not in race conditions 01023 //(file perms modified, file erased) 01024 strings=new T_STRING<ST>[n]; 01025 01026 for (int32_t i=0; i<n; i++) 01027 { 01028 char* fname=CIO::concat_filename(namelist[i]->d_name); 01029 01030 struct stat s; 01031 off_t filesize=0; 01032 01033 if (!stat(fname, &s) && s.st_size>0) 01034 { 01035 filesize=s.st_size/sizeof(ST); 01036 01037 FILE* f=fopen(fname, "ro"); 01038 if (f) 01039 { 01040 ST* str=new ST[filesize]; 01041 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize); 01042 fread(str, sizeof(ST), filesize, f); 01043 strings[num].string=str; 01044 strings[num].length=filesize; 01045 max_len=CMath::max(max_len, strings[num].length); 01046 01047 num++; 01048 fclose(f); 01049 } 01050 } 01051 else 01052 SG_ERROR("empty or non readable file \'%s\'\n", fname); 01053 01054 free(namelist[i]); 01055 } 01056 free(namelist); 01057 01058 if (num>0 && strings) 01059 { 01060 set_features(strings, num, max_len); 01061 return true; 01062 } 01063 } 01064 return false; 01065 } 01066 01074 bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 01075 { 01076 if (p_features) 01077 { 01078 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 01079 01080 //compute histogram for char/byte 01081 for (int32_t i=0; i<p_num_vectors; i++) 01082 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length); 01083 01084 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 01085 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 01086 01087 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 01088 { 01089 cleanup(); 01090 SG_UNREF(alphabet); 01091 01092 alphabet=alpha; 01093 SG_REF(alphabet); 01094 01095 this->features=p_features; 01096 this->num_vectors=p_num_vectors; 01097 this->max_string_length=p_max_string_length; 01098 01099 return true; 01100 } 01101 else 01102 SG_UNREF(alpha); 01103 } 01104 01105 return false; 01106 } 01107 01113 bool append_features(CStringFeatures<ST>* sf) 01114 { 01115 ASSERT(sf); 01116 T_STRING<ST>* new_features = new T_STRING<ST>[sf->num_vectors]; 01117 01118 for (int32_t i=0; i<sf->num_vectors; i++) 01119 { 01120 int32_t length=sf->features[i].length; 01121 new_features[i].string=new ST[length]; 01122 memcpy(new_features[i].string, sf->features[i].string, length); 01123 new_features[i].length=length; 01124 } 01125 return append_features(new_features, sf->num_vectors, 01126 sf->max_string_length); 01127 } 01128 01139 bool append_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 01140 { 01141 if (!features) 01142 return set_features(p_features, p_num_vectors, p_max_string_length); 01143 01144 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 01145 01146 //compute histogram for char/byte 01147 for (int32_t i=0; i<p_num_vectors; i++) 01148 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length); 01149 01150 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 01151 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 01152 01153 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 01154 { 01155 SG_UNREF(alpha); 01156 for (int32_t i=0; i<p_num_vectors; i++) 01157 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].length); 01158 01159 int32_t old_num_vectors=num_vectors; 01160 num_vectors=old_num_vectors+p_num_vectors; 01161 T_STRING<ST>* new_features = new T_STRING<ST>[num_vectors]; 01162 01163 for (int32_t i=0; i<num_vectors; i++) 01164 { 01165 if (i<old_num_vectors) 01166 { 01167 new_features[i].string=features[i].string; 01168 new_features[i].length=features[i].length; 01169 } 01170 else 01171 { 01172 new_features[i].string=p_features[i-old_num_vectors].string; 01173 new_features[i].length=p_features[i-old_num_vectors].length; 01174 } 01175 } 01176 delete[] features; 01177 delete[] p_features; // free now obsolete features 01178 01179 this->features=new_features; 01180 this->max_string_length=CMath::max(max_string_length, p_max_string_length); 01181 01182 return true; 01183 } 01184 SG_UNREF(alpha); 01185 01186 return false; 01187 } 01188 01195 virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len) 01196 { 01197 num_str=num_vectors; 01198 max_str_len=max_string_length; 01199 return features; 01200 } 01201 01208 virtual T_STRING<ST>* copy_features(int32_t& num_str, int32_t& max_str_len) 01209 { 01210 ASSERT(num_vectors>0); 01211 01212 num_str=num_vectors; 01213 max_str_len=max_string_length; 01214 T_STRING<ST>* new_feat=new T_STRING<ST>[num_str]; 01215 01216 for (int32_t i=0; i<num_str; i++) 01217 { 01218 int32_t len; 01219 bool free_vec; 01220 ST* vec=get_feature_vector(i, len, free_vec); 01221 new_feat[i].string=new ST[len]; 01222 new_feat[i].length=len; 01223 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST)); 01224 free_feature_vector(vec, i, free_vec); 01225 } 01226 01227 return new_feat; 01228 } 01229 01235 virtual void get_features(T_STRING<ST>** dst, int32_t* num_str) 01236 { 01237 int32_t num_vec; 01238 int32_t max_str_len; 01239 *dst=copy_features(num_vec, max_str_len); 01240 *num_str=num_vec; 01241 } 01242 01247 virtual inline void save(CFile* writer); 01248 01255 virtual bool load_compressed(char* src, bool decompress) 01256 { 01257 FILE* file=NULL; 01258 01259 if (!(file=fopen(src, "r"))) 01260 return false; 01261 cleanup(); 01262 01263 // header shogun v0 01264 char id[4]; 01265 fread(&id[0], sizeof(char), 1, file); 01266 ASSERT(id[0]=='S'); 01267 fread(&id[1], sizeof(char), 1, file); 01268 ASSERT(id[1]=='G'); 01269 fread(&id[2], sizeof(char), 1, file); 01270 ASSERT(id[2]=='V'); 01271 fread(&id[3], sizeof(char), 1, file); 01272 ASSERT(id[3]=='0'); 01273 01274 //compression type 01275 uint8_t c; 01276 fread(&c, sizeof(uint8_t), 1, file); 01277 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c); 01278 //alphabet 01279 uint8_t a; 01280 delete alphabet; 01281 fread(&a, sizeof(uint8_t), 1, file); 01282 alphabet=new CAlphabet((EAlphabet) a); 01283 // number of vectors 01284 fread(&num_vectors, sizeof(int32_t), 1, file); 01285 ASSERT(num_vectors>0); 01286 // maximum string length 01287 fread(&max_string_length, sizeof(int32_t), 1, file); 01288 ASSERT(max_string_length>0); 01289 01290 features=new T_STRING<ST>[num_vectors]; 01291 01292 // vectors 01293 for (int32_t i=0; i<num_vectors; i++) 01294 { 01295 // vector len compressed 01296 int32_t len_compressed; 01297 fread(&len_compressed, sizeof(int32_t), 1, file); 01298 // vector len uncompressed 01299 int32_t len_uncompressed; 01300 fread(&len_uncompressed, sizeof(int32_t), 1, file); 01301 01302 // vector raw data 01303 if (decompress) 01304 { 01305 features[i].string=new ST[len_uncompressed]; 01306 features[i].length=len_uncompressed; 01307 uint8_t* compressed=new uint8_t[len_compressed]; 01308 fread(compressed, len_compressed, 1, file); 01309 uint64_t uncompressed_size=len_uncompressed; 01310 uncompressed_size*=sizeof(ST); 01311 compressor->decompress(compressed, len_compressed, 01312 (uint8_t*) features[i].string, uncompressed_size); 01313 delete[] compressed; 01314 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST)); 01315 } 01316 else 01317 { 01318 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST)); 01319 features[i].string=new ST[len_compressed+offs]; 01320 features[i].length=len_compressed+offs; 01321 int32_t* feat32ptr=((int32_t*) (features[i].string)); 01322 memset(features[i].string, 0, offs*sizeof(ST)); 01323 feat32ptr[0]=(int32_t) len_compressed; 01324 feat32ptr[1]=(int32_t) len_uncompressed; 01325 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]); 01326 fread(compressed, len_compressed, 1, file); 01327 } 01328 } 01329 01330 delete compressor; 01331 fclose(file); 01332 return false; 01333 } 01334 01342 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level) 01343 { 01344 FILE* file=NULL; 01345 01346 if (!(file=fopen(dest, "wb"))) 01347 return false; 01348 01349 CCompressor* compressor= new CCompressor(compression); 01350 01351 // header shogun v0 01352 const char* id="SGV0"; 01353 fwrite(&id[0], sizeof(char), 1, file); 01354 fwrite(&id[1], sizeof(char), 1, file); 01355 fwrite(&id[2], sizeof(char), 1, file); 01356 fwrite(&id[3], sizeof(char), 1, file); 01357 01358 //compression type 01359 uint8_t c=(uint8_t) compression; 01360 fwrite(&c, sizeof(uint8_t), 1, file); 01361 //alphabet 01362 uint8_t a=(uint8_t) alphabet->get_alphabet(); 01363 fwrite(&a, sizeof(uint8_t), 1, file); 01364 // number of vectors 01365 fwrite(&num_vectors, sizeof(int32_t), 1, file); 01366 // maximum string length 01367 fwrite(&max_string_length, sizeof(int32_t), 1, file); 01368 01369 // vectors 01370 for (int32_t i=0; i<num_vectors; i++) 01371 { 01372 int32_t len=-1; 01373 bool vfree; 01374 ST* vec=get_feature_vector(i, len, vfree); 01375 01376 uint8_t* compressed=NULL; 01377 uint64_t compressed_size=0; 01378 01379 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST), 01380 compressed, compressed_size, level); 01381 01382 int32_t len_compressed = (int32_t) compressed_size; 01383 // vector len compressed in bytes 01384 fwrite(&len_compressed, sizeof(int32_t), 1, file); 01385 // vector len uncompressed in number of elements of type ST 01386 fwrite(&len, sizeof(int32_t), 1, file); 01387 // vector raw data 01388 fwrite(compressed, compressed_size, 1, file); 01389 delete[] compressed; 01390 01391 free_feature_vector(vec, i, vfree); 01392 } 01393 01394 delete compressor; 01395 fclose(file); 01396 return true; 01397 } 01398 01399 01404 virtual int32_t get_size() { return sizeof(ST); } 01405 01411 virtual bool apply_preproc(bool force_preprocessing=false) 01412 { 01413 SG_DEBUG( "force: %d\n", force_preprocessing); 01414 01415 for (int32_t i=0; i<get_num_preproc(); i++) 01416 { 01417 if ( (!is_preprocessed(i) || force_preprocessing) ) 01418 { 01419 set_preprocessed(i); 01420 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i); 01421 SG_INFO( "preprocessing using preproc %s\n", p->get_name()); 01422 01423 if (!p->apply_to_string_features(this)) 01424 { 01425 SG_UNREF(p); 01426 return false; 01427 } 01428 else 01429 SG_UNREF(p); 01430 } 01431 } 01432 return true; 01433 } 01434 01444 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0) 01445 { 01446 ASSERT(step_size>0); 01447 ASSERT(window_size>0); 01448 ASSERT(num_vectors==1 || single_string); 01449 ASSERT(max_string_length>=window_size || 01450 (single_string && length_of_single_string>=window_size)); 01451 01452 //in case we are dealing with a single remapped string 01453 //allow remapping 01454 if (single_string) 01455 num_vectors= (length_of_single_string-window_size)/step_size + 1; 01456 else if (num_vectors==1) 01457 { 01458 num_vectors= (max_string_length-window_size)/step_size + 1; 01459 length_of_single_string=max_string_length; 01460 } 01461 01462 T_STRING<ST>* f=new T_STRING<ST>[num_vectors]; 01463 int32_t offs=0; 01464 for (int32_t i=0; i<num_vectors; i++) 01465 { 01466 f[i].string=&features[0].string[offs+skip]; 01467 f[i].length=window_size-skip; 01468 offs+=step_size; 01469 } 01470 single_string=features[0].string; 01471 delete[] features; 01472 features=f; 01473 max_string_length=window_size-skip; 01474 01475 return num_vectors; 01476 } 01477 01486 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0) 01487 { 01488 ASSERT(positions); 01489 ASSERT(window_size>0); 01490 ASSERT(num_vectors==1 || single_string); 01491 ASSERT(max_string_length>=window_size || 01492 (single_string && length_of_single_string>=window_size)); 01493 01494 num_vectors= positions->get_num_elements(); 01495 ASSERT(num_vectors>0); 01496 01497 int32_t len; 01498 01499 //in case we are dealing with a single remapped string 01500 //allow remapping 01501 if (single_string) 01502 len=length_of_single_string; 01503 else 01504 { 01505 single_string=features[0].string; 01506 len=max_string_length; 01507 length_of_single_string=max_string_length; 01508 } 01509 01510 T_STRING<ST>* f=new T_STRING<ST>[num_vectors]; 01511 for (int32_t i=0; i<num_vectors; i++) 01512 { 01513 int32_t p=positions->get_element(i); 01514 01515 if (p>=0 && p<=len-window_size) 01516 { 01517 f[i].string=&features[0].string[p+skip]; 01518 f[i].length=window_size-skip; 01519 } 01520 else 01521 { 01522 num_vectors=1; 01523 max_string_length=len; 01524 features[0].length=len; 01525 single_string=NULL; 01526 delete[] f; 01527 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n", 01528 window_size, i, p, len); 01529 return -1; 01530 } 01531 } 01532 01533 delete[] features; 01534 features=f; 01535 max_string_length=window_size-skip; 01536 01537 return num_vectors; 01538 } 01539 01551 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01552 { 01553 return obtain_from_char_features(sf, start, p_order, gap, rev); 01554 } 01555 01565 template <class CT> 01566 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01567 { 01568 ASSERT(sf); 01569 01570 CAlphabet* alpha=sf->get_alphabet(); 01571 ASSERT(alpha->get_num_symbols_in_histogram() > 0); 01572 01573 this->order=p_order; 01574 cleanup(); 01575 01576 num_vectors=sf->get_num_vectors(); 01577 ASSERT(num_vectors>0); 01578 max_string_length=sf->get_max_vector_length()-start; 01579 features=new T_STRING<ST>[num_vectors]; 01580 01581 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(), 01582 alpha->get_num_symbols_in_histogram()); 01583 01584 for (int32_t i=0; i<num_vectors; i++) 01585 { 01586 int32_t len=-1; 01587 bool vfree; 01588 CT* c=sf->get_feature_vector(i, len, vfree); 01589 ASSERT(!vfree); // won't work when preprocessors are attached 01590 01591 features[i].string=new ST[len]; 01592 features[i].length=len; 01593 01594 ST* str=features[i].string; 01595 for (int32_t j=0; j<len; j++) 01596 str[j]=(ST) alpha->remap_to_bin(c[j]); 01597 } 01598 01599 original_num_symbols=alpha->get_num_symbols(); 01600 int32_t max_val=alpha->get_num_bits(); 01601 01602 SG_UNREF(alpha); 01603 01604 if (p_order>1) 01605 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01606 else 01607 num_symbols=original_num_symbols; 01608 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01609 01610 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01611 { 01612 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 01613 return false; 01614 } 01615 01616 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ; 01617 for (int32_t line=0; line<num_vectors; line++) 01618 { 01619 int32_t len=0; 01620 bool vfree; 01621 ST* fv=get_feature_vector(line, len, vfree); 01622 ASSERT(!vfree); // won't work when preprocessors are attached 01623 01624 if (rev) 01625 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap); 01626 else 01627 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap); 01628 01629 /* fix the length of the string -- hacky */ 01630 features[line].length-=start+gap ; 01631 if (features[line].length<0) 01632 features[line].length=0 ; 01633 } 01634 01635 compute_symbol_mask_table(max_val); 01636 01637 return true; 01638 } 01639 01647 bool have_same_length(int32_t len=-1) 01648 { 01649 if (len!=-1) 01650 { 01651 if (len!=get_max_vector_length()) 01652 return false; 01653 } 01654 len = get_max_vector_length(); 01655 01656 for (int32_t i=0; i<num_vectors; i++) 01657 { 01658 if (get_vector_length(i)!=len) 01659 return false; 01660 } 01661 01662 return true; 01663 } 01664 01669 inline void embed_features(int32_t p_order) 01670 { 01671 ASSERT(alphabet->get_num_symbols_in_histogram() > 0); 01672 01673 order=p_order; 01674 original_num_symbols=alphabet->get_num_symbols(); 01675 int32_t max_val=alphabet->get_num_bits(); 01676 01677 if (p_order>1) 01678 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01679 else 01680 num_symbols=original_num_symbols; 01681 01682 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01683 01684 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01685 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 01686 01687 ST mask=0; 01688 for (int32_t i=0; i<p_order*max_val; i++) 01689 mask= (mask<<1) | ((ST) 1); 01690 01691 for (int32_t i=0; i<num_vectors; i++) 01692 { 01693 int32_t len=features[i].length; 01694 01695 if (len < p_order) 01696 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order); 01697 01698 ST* str = features[i].string; 01699 01700 // convert first word 01701 for (int32_t j=0; j<p_order; j++) 01702 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01703 str[0]=embed_word(&str[0], p_order); 01704 01705 // convert the rest 01706 int32_t idx=0; 01707 for (int32_t j=p_order; j<len; j++) 01708 { 01709 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01710 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask; 01711 idx++; 01712 } 01713 01714 features[i].length=len-p_order+1; 01715 } 01716 01717 compute_symbol_mask_table(max_val); 01718 } 01719 01724 inline void compute_symbol_mask_table(int64_t max_val) 01725 { 01726 delete[] symbol_mask_table; 01727 symbol_mask_table=new ST[256]; 01728 01729 uint64_t mask=0; 01730 for (int32_t i=0; i< (int64_t) max_val; i++) 01731 mask=(mask<<1) | 1; 01732 01733 for (int32_t i=0; i<256; i++) 01734 { 01735 uint8_t bits=(uint8_t) i; 01736 symbol_mask_table[i]=0; 01737 01738 for (int32_t j=0; j<8; j++) 01739 { 01740 if (bits & 1) 01741 symbol_mask_table[i]|=mask<<(max_val*j); 01742 01743 bits>>=1; 01744 } 01745 } 01746 } 01747 01754 inline void unembed_word(ST word, uint8_t* seq, int32_t len) 01755 { 01756 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01757 01758 ST mask=0; 01759 for (int32_t i=0; i<nbits; i++) 01760 mask=(mask<<1) | (ST) 1; 01761 01762 for (int32_t i=0; i<len; i++) 01763 { 01764 ST w=(word & mask); 01765 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w); 01766 word>>=nbits; 01767 } 01768 } 01769 01775 inline ST embed_word(ST* seq, int32_t len) 01776 { 01777 ST value=(ST) 0; 01778 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01779 for (int32_t i=0; i<len; i++) 01780 { 01781 value<<=nbits; 01782 value|=seq[i]; 01783 } 01784 01785 return value; 01786 } 01787 01790 void determine_maximum_string_length() 01791 { 01792 max_string_length=0; 01793 01794 for (int32_t i=0; i<num_vectors; i++) 01795 max_string_length=CMath::max(max_string_length, features[i].length); 01796 } 01797 01805 static ST* get_zero_terminated_string_copy(T_STRING<ST> str) 01806 { 01807 int32_t l=str.length; 01808 ST* s=new ST[l+1]; 01809 memcpy(s, str.string, sizeof(ST)*l); 01810 s[l]='\0'; 01811 return s; 01812 } 01813 01820 virtual void set_feature_vector(int32_t num, ST* string, int32_t len) 01821 { 01822 ASSERT(features); 01823 ASSERT(num<num_vectors); 01824 01825 features[num].length=len ; 01826 features[num].string=string ; 01827 01828 max_string_length=CMath::max(len, max_string_length); 01829 } 01830 01831 01834 virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true) 01835 { 01836 int32_t nsym=get_num_symbols(); 01837 int32_t slen=get_max_vector_length(); 01838 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t); 01839 float64_t* h= (float64_t*) malloc(sz); 01840 ASSERT(h); 01841 memset(h, 0, sz); 01842 01843 float64_t* h_normalizer=new float64_t[slen]; 01844 memset(h_normalizer, 0, slen*sizeof(float64_t)); 01845 int32_t num_str=get_num_vectors(); 01846 for (int32_t i=0; i<num_str; i++) 01847 { 01848 int32_t len; 01849 bool free_vec; 01850 ST* vec=get_feature_vector(i, len, free_vec); 01851 for (int32_t j=0; j<len; j++) 01852 { 01853 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++; 01854 h_normalizer[j]++; 01855 } 01856 free_feature_vector(vec, i, free_vec); 01857 } 01858 01859 if (normalize) 01860 { 01861 for (int32_t i=0; i<slen; i++) 01862 { 01863 for (int32_t j=0; j<nsym; j++) 01864 { 01865 if (h_normalizer && h_normalizer[i]) 01866 h[int64_t(i)*nsym+j]/=h_normalizer[i]; 01867 } 01868 } 01869 } 01870 delete[] h_normalizer; 01871 01872 *hist=h; 01873 *rows=nsym; 01874 *cols=slen; 01875 } 01876 01879 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec) 01880 { 01881 ASSERT(rows == get_num_symbols()); 01882 cleanup(); 01883 float64_t* randoms=new float64_t[cols]; 01884 T_STRING<ST>* sf=new T_STRING<ST>[num_vec]; 01885 01886 for (int32_t i=0; i<num_vec; i++) 01887 { 01888 sf[i].string=new ST[cols]; 01889 sf[i].length=cols; 01890 01891 CMath::random_vector(randoms, cols, 0.0, 1.0); 01892 01893 for (int32_t j=0; j<cols; j++) 01894 { 01895 float64_t lik=hist[int64_t(j)*rows+0]; 01896 01897 int32_t c; 01898 for (c=0; c<rows-1; c++) 01899 { 01900 if (randoms[j]<=lik) 01901 break; 01902 lik+=hist[int64_t(j)*rows+c+1]; 01903 } 01904 sf[i].string[j]=alphabet->remap_to_char(c); 01905 } 01906 } 01907 delete[] randoms; 01908 set_features(sf, num_vec, cols); 01909 } 01910 01912 inline virtual const char* get_name() const { return "StringFeatures"; } 01913 01914 protected: 01915 01926 virtual ST* compute_feature_vector(int32_t num, int32_t& len) 01927 { 01928 ASSERT(features && num<num_vectors); 01929 01930 len=features[num].length; 01931 if (len<=0) 01932 return NULL; 01933 01934 ST* target=new ST[len]; 01935 memcpy(target, features[num].string, len*sizeof(ST)); 01936 return target; 01937 } 01938 01939 #ifdef HAVE_BOOST_SERIALIZATION 01940 private: 01941 01942 friend class ::boost::serialization::access; 01943 template<class Archive> 01944 void save(Archive & ar, const unsigned int archive_version) const 01945 { 01946 01947 SG_DEBUG("archiving StringFeatures\n"); 01948 01949 ar & ::boost::serialization::base_object<CFeatures>(*this); 01950 01951 ar & alphabet; 01952 01953 ar & num_vectors; 01954 for (int i=0; i < num_vectors; ++i) { 01955 ar & features[i]; 01956 } 01957 01958 ar & length_of_single_string; 01959 for (int i=0; i < length_of_single_string; ++i) { 01960 ar & single_string[i]; 01961 } 01962 01963 ar & max_string_length; 01964 ar & num_symbols; 01965 ar & original_num_symbols; 01966 ar & order; 01967 01969 //TODO?! how long 01970 //ST* symbol_mask_table; 01971 01972 SG_DEBUG("done archiving StringFeatures\n"); 01973 01974 } 01975 01976 template<class Archive> 01977 void load(Archive & ar, const unsigned int archive_version) 01978 { 01979 01980 SG_DEBUG("archiving StringFeatures\n"); 01981 01982 ar & ::boost::serialization::base_object<CFeatures>(*this); 01983 01984 01985 ar & alphabet; 01986 01987 ar & num_vectors; 01988 01989 //T_STRING<ST>* features = new T_STRING<ST>[num_vectors]; 01990 features = new T_STRING<ST>[num_vectors]; 01991 for (int i=0; i < num_vectors; ++i) { 01992 ar & features[i]; 01993 } 01994 01995 01996 ar & length_of_single_string; 01997 01998 //ST* single_string = new ST[length_of_single_string]; 01999 single_string = new ST[length_of_single_string]; 02000 for (int i=0; i < length_of_single_string; ++i) { 02001 ar & single_string[i]; 02002 } 02003 02004 ar & max_string_length; 02005 ar & num_symbols; 02006 ar & original_num_symbols; 02007 ar & order; 02008 02010 //TODO?! how long -> num_of_symbols? 02011 //ST* symbol_mask_table; 02012 02013 SG_DEBUG("done archiving StringFeatures\n"); 02014 02015 } 02016 02017 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER(); 02018 02019 02020 #endif //HAVE_BOOST_SERIALIZATION 02021 02022 02023 protected: 02024 02026 CAlphabet* alphabet; 02027 02029 int32_t num_vectors; 02030 02032 T_STRING<ST>* features; 02033 02035 ST* single_string; 02036 02038 int32_t length_of_single_string; 02039 02041 int32_t max_string_length; 02042 02044 floatmax_t num_symbols; 02045 02047 floatmax_t original_num_symbols; 02048 02050 int32_t order; 02051 02053 ST* symbol_mask_table; 02054 02056 bool preprocess_on_get; 02057 02059 CCache<ST>* feature_cache; 02060 }; 02061 02062 #ifndef DOXYGEN_SHOULD_SKIP_THIS 02063 02067 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type() 02068 { 02069 return F_BOOL; 02070 } 02071 02076 template<> inline EFeatureType CStringFeatures<char>::get_feature_type() 02077 { 02078 return F_CHAR; 02079 } 02080 02085 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type() 02086 { 02087 return F_BYTE; 02088 } 02089 02094 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type() 02095 { 02096 return F_SHORT; 02097 } 02098 02103 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type() 02104 { 02105 return F_WORD; 02106 } 02107 02112 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type() 02113 { 02114 return F_INT; 02115 } 02116 02121 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type() 02122 { 02123 return F_UINT; 02124 } 02125 02130 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type() 02131 { 02132 return F_LONG; 02133 } 02134 02139 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type() 02140 { 02141 return F_ULONG; 02142 } 02143 02148 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type() 02149 { 02150 return F_SHORTREAL; 02151 } 02152 02157 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type() 02158 { 02159 return F_DREAL; 02160 } 02161 02166 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type() 02167 { 02168 return F_LONGREAL; 02169 } 02170 02171 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask) 02172 { 02173 return symbol; 02174 } 02175 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask) 02176 { 02177 return symbol; 02178 } 02179 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask) 02180 { 02181 return symbol; 02182 } 02183 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask) 02184 { 02185 return symbol; 02186 } 02187 02188 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount) 02189 { 02190 return false; 02191 } 02192 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount) 02193 { 02194 return 0; 02195 } 02196 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount) 02197 { 02198 return 0; 02199 } 02200 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount) 02201 { 02202 return 0; 02203 } 02204 02205 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount) 02206 { 02207 return symbol; 02208 } 02209 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount) 02210 { 02211 return symbol; 02212 } 02213 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount) 02214 { 02215 return symbol; 02216 } 02217 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount) 02218 { 02219 return symbol; 02220 } 02221 02222 #ifndef SUNOS 02223 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 02224 { 02225 return false; 02226 } 02227 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 02228 { 02229 return false; 02230 } 02231 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 02232 { 02233 return false; 02234 } 02235 #endif 02236 02237 template<> inline void CStringFeatures<float32_t>::embed_features(int32_t p_order) 02238 { 02239 } 02240 template<> inline void CStringFeatures<float64_t>::embed_features(int32_t p_order) 02241 { 02242 } 02243 template<> inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order) 02244 { 02245 } 02246 02247 template<> inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val) 02248 { 02249 } 02250 template<> inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val) 02251 { 02252 } 02253 template<> inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val) 02254 { 02255 } 02256 02257 template<> inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len) 02258 { 02259 return 0; 02260 } 02261 template<> inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len) 02262 { 02263 return 0; 02264 } 02265 template<> inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len) 02266 { 02267 return 0; 02268 } 02269 02270 template<> inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len) 02271 { 02272 } 02273 template<> inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len) 02274 { 02275 } 02276 template<> inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len) 02277 { 02278 } 02279 #define LOAD(f_load, sg_type) \ 02280 template<> inline void CStringFeatures<sg_type>::load(CFile* loader) \ 02281 { \ 02282 SG_INFO( "loading...\n"); \ 02283 \ 02284 T_STRING<sg_type>* strs; \ 02285 int32_t num_str; \ 02286 int32_t max_len; \ 02287 loader->f_load(strs, num_str, max_len); \ 02288 set_features(strs, num_str, max_len); \ 02289 } 02290 02291 LOAD(get_bool_string_list, bool) 02292 LOAD(get_char_string_list, char) 02293 LOAD(get_byte_string_list, uint8_t) 02294 LOAD(get_short_string_list, int16_t) 02295 LOAD(get_word_string_list, uint16_t) 02296 LOAD(get_int_string_list, int32_t) 02297 LOAD(get_uint_string_list, uint32_t) 02298 LOAD(get_long_string_list, int64_t) 02299 LOAD(get_ulong_string_list, uint64_t) 02300 LOAD(get_shortreal_string_list, float32_t) 02301 LOAD(get_real_string_list, float64_t) 02302 LOAD(get_longreal_string_list, floatmax_t) 02303 #undef LOAD 02304 02305 #define SAVE(f_write, sg_type) \ 02306 template<> inline void CStringFeatures<sg_type>::save(CFile* writer) \ 02307 { \ 02308 ASSERT(writer); \ 02309 writer->f_write(features, num_vectors); \ 02310 } 02311 02312 SAVE(set_bool_string_list, bool) 02313 SAVE(set_char_string_list, char) 02314 SAVE(set_byte_string_list, uint8_t) 02315 SAVE(set_short_string_list, int16_t) 02316 SAVE(set_word_string_list, uint16_t) 02317 SAVE(set_int_string_list, int32_t) 02318 SAVE(set_uint_string_list, uint32_t) 02319 SAVE(set_long_string_list, int64_t) 02320 SAVE(set_ulong_string_list, uint64_t) 02321 SAVE(set_shortreal_string_list, float32_t) 02322 SAVE(set_real_string_list, float64_t) 02323 SAVE(set_longreal_string_list, floatmax_t) 02324 #undef SAVE 02325 #endif // DOXYGEN_SHOULD_SKIP_THIS 02326 } 02327 #endif // _CSTRINGFEATURES__H__