|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max Planck Society 00009 */ 00010 #ifndef __BITSTRING_H__ 00011 #define __BITSTRING_H__ 00012 00013 #include <shogun/features/Alphabet.h> 00014 #include <shogun/lib/common.h> 00015 #include <shogun/lib/io.h> 00016 #include <shogun/lib/MemoryMappedFile.h> 00017 #include <shogun/lib/Mathematics.h> 00018 00019 namespace shogun 00020 { 00028 class CBitString : public CSGObject 00029 { 00030 public: 00038 CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0) 00039 { 00040 alphabet=new CAlphabet(alpha); 00041 int32_t nbits=alphabet->get_num_bits(); 00042 word_len = width*nbits; 00043 00044 mask=0; 00045 for (int32_t j=0; j<word_len; j++) 00046 mask=(mask<<1) | (uint64_t) 1; 00047 mask<<=sizeof(uint64_t)*8-word_len; 00048 00049 single_mask=0; 00050 for (int32_t j=0; j<nbits; j++) 00051 mask=(mask<<1) | (uint64_t) 1; 00052 } 00053 00055 ~CBitString() 00056 { 00057 cleanup(); 00058 SG_UNREF(alphabet); 00059 } 00060 00062 void cleanup() 00063 { 00064 delete[] string; 00065 string=NULL; 00066 length=0; 00067 } 00068 00074 void obtain_from_char(char* str, uint64_t len) 00075 { 00076 cleanup(); 00077 uint64_t stream_len=len/sizeof(uint64_t)+1; 00078 string=new uint64_t[stream_len]; 00079 length=len; 00080 00081 uint64_t w=0; 00082 int32_t nbits=alphabet->get_num_bits(); 00083 uint64_t j=0; 00084 int32_t nfit=8*sizeof(w)/nbits; 00085 for (uint64_t i=0; i<len; i++) 00086 { 00087 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]); 00088 00089 if (i % nfit == nfit-1) 00090 { 00091 string[j]=w; 00092 j++; 00093 } 00094 } 00095 00096 if (j<stream_len) 00097 { 00098 string[j]=w; 00099 j++; 00100 } 00101 00102 ASSERT(j==stream_len); 00103 } 00104 00111 void load_fasta_file(const char* fname, bool ignore_invalid=false) 00112 { 00113 cleanup(); 00114 00115 uint64_t len=0; 00116 uint64_t offs=0; 00117 00118 CMemoryMappedFile<char> f(fname); 00119 00120 uint64_t id_len=0; 00121 char* id=f.get_line(id_len, offs); 00122 00123 if (!id_len || id[0]!='>') 00124 SG_SERROR("No fasta hunks (lines starting with '>') found\n"); 00125 00126 if (offs==f.get_size()) 00127 SG_SERROR("Empty file?\n"); 00128 00129 char* fasta=NULL; 00130 char* s=NULL; 00131 int32_t spanned_lines=0; 00132 int32_t fasta_len=0; 00133 00134 while (true) 00135 { 00136 s=f.get_line(len, offs); 00137 00138 if (!fasta) 00139 fasta=s; 00140 00141 if (!s || len==0) 00142 SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len); 00143 00144 if (s[0]=='>') 00145 SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n"); 00146 00147 if (offs==f.get_size()) 00148 { 00149 offs-=len+1; // seek to beginning 00150 fasta_len+=len; 00151 00152 uint64_t w=0; 00153 int32_t nbits=alphabet->get_num_bits(); 00154 uint64_t nfit=8*sizeof(w)/nbits; 00155 00156 len = fasta_len-spanned_lines; 00157 uint64_t stream_len=len/(nfit)+1; 00158 string=new uint64_t[stream_len]; 00159 length=len; 00160 00161 int32_t idx=0; 00162 int32_t k=0; 00163 00164 for (int32_t j=0; j<fasta_len; j++, k++) 00165 { 00166 if (fasta[j]=='\n') 00167 { 00168 k--; 00169 continue; 00170 } 00171 00172 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]); 00173 00174 if (k % nfit == nfit-1) 00175 { 00176 string[idx]=w; 00177 w=0; 00178 idx++; 00179 } 00180 } 00181 00182 if (idx<stream_len) 00183 string[idx]=w<<(nbits*(nfit - k%nfit)); 00184 00185 break; 00186 } 00187 00188 spanned_lines++; 00189 fasta_len+=len+1; // including '\n' 00190 } 00191 } 00192 00198 void set_string(uint64_t* str, uint64_t len) 00199 { 00200 cleanup(); 00201 string=str; 00202 length=len; 00203 } 00204 00209 void create(uint64_t len) 00210 { 00211 cleanup(); 00212 uint64_t stream_len=len/sizeof(uint64_t)+1; 00213 string=new uint64_t[stream_len]; 00214 CMath::fill_vector(string, (int32_t) stream_len, (uint64_t) 0); 00215 length=len; 00216 } 00217 00218 /* 00219 inline uint64_t condense(uint64_t bits, uint64_t mask) const 00220 { 00221 uint64_t res = 0 ; 00222 uint64_t m=mask ; 00223 uint64_t b=bits ; 00224 char cnt=0 ; 00225 00226 char ar[256][256] ; 00227 00228 for (int i=0; i<8; i++) 00229 { 00230 //fprintf(stdout, "%i %lx %lx %lx %i\n", i, res, m, b, (int)cnt) ; 00231 if (m&1) 00232 res = res>>8 | ar[b&255][m&255] ; 00233 //else 00234 // cnt++ ; 00235 m=m>>8 ; 00236 b=b>>8 ; 00237 } 00238 res=res>>cnt ; 00239 //fprintf(stdout, "%lx %lx %lx\n", res, m, b) ; 00240 00241 //res = res & bits & mask ; 00242 00243 return res ; 00244 } 00245 */ 00246 00251 inline uint64_t operator[](uint64_t index) const 00252 { 00253 ASSERT(index<length); 00254 00255 uint64_t bitindex=alphabet->get_num_bits()*index; 00256 int32_t ws=8*sizeof(uint64_t); 00257 uint64_t i=bitindex/ws; 00258 int32_t j=bitindex % ws; 00259 int32_t missing=word_len-(ws-j); 00260 00261 //SG_SPRINT("i=%lld j=%d ws=%d word_len=%d missing=%d left=%llx shift=%d\n", i, j, ws, word_len, missing, ( string[i] << j ) & mask, ws-word_len); 00262 uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len); 00263 00264 if (missing>0) 00265 res|= ( string[i+1] >> (ws-missing) ); 00266 00267 //res = condense(res, 1<<31|1<<24|1<<10|1<<8|1<<4|1<<2|1<<1|1) ; 00268 00269 return res; 00270 } 00271 00277 inline void set_binary_word(uint16_t word, uint64_t index) 00278 { 00279 ASSERT(index<length); 00280 00281 uint64_t bitindex=alphabet->get_num_bits()*index; 00282 int32_t ws=8*sizeof(uint64_t); 00283 uint64_t i=bitindex/ws; 00284 int32_t j=bitindex % ws; 00285 int32_t missing=word_len-(ws-j); 00286 00287 uint64_t sl = j-word_len; 00288 uint64_t ml; 00289 uint64_t wl; 00290 uint64_t mr; 00291 uint64_t wr; 00292 00293 if (sl>0) 00294 { 00295 ml=mask<<sl; 00296 wl=word<<sl ; 00297 } 00298 else 00299 { 00300 sl=-sl; 00301 ml=mask>>sl; 00302 wl=word>>sl; 00303 } 00304 00305 string[i] = ( string[i] & (~ml) ) | ( wl & ml); 00306 00307 if (missing>0) 00308 { 00309 mr = mask<<missing; 00310 wr = word<<missing; 00311 string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr); 00312 } 00313 00314 } 00315 00317 inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; } 00318 00320 inline virtual const char* get_name() const { return "BitString"; } 00321 00322 private: 00324 CAlphabet* alphabet; 00326 uint64_t* string; 00328 uint64_t length; 00330 int32_t word_len; 00332 uint64_t mask; 00334 uint64_t single_mask; 00335 }; 00336 } 00337 #endif //__BITSTRING_H__