|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2010 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 * Copyright (C) 2010 Berlin Institute of Technology 00011 */ 00012 00013 #ifndef _SPARSEFEATURES__H__ 00014 #define _SPARSEFEATURES__H__ 00015 00016 #include <string.h> 00017 #include <stdlib.h> 00018 00019 #include "lib/common.h" 00020 #include "lib/Mathematics.h" 00021 #include "lib/Cache.h" 00022 #include "lib/io.h" 00023 #include "lib/Cache.h" 00024 #include "lib/File.h" 00025 00026 #include "features/Labels.h" 00027 #include "features/Features.h" 00028 #include "features/DotFeatures.h" 00029 #include "features/SimpleFeatures.h" 00030 #include "preproc/SparsePreProc.h" 00031 00032 namespace shogun 00033 { 00034 00035 class CFile; 00036 class CLabels; 00037 class CFeatures; 00038 class CDotFeatures; 00039 template <class ST> class CSimpleFeatures; 00040 template <class ST> class CSparsePreProc; 00041 00042 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00043 00044 template <class ST> struct TSparseEntry 00045 { 00047 int32_t feat_index; 00049 ST entry; 00050 }; 00051 00053 template <class ST> struct TSparse 00054 { 00055 public: 00057 int32_t vec_index; 00059 int32_t num_feat_entries; 00061 TSparseEntry<ST>* features; 00062 }; 00063 #endif // DOXYGEN_SHOULD_SKIP_THIS 00064 00077 template <class ST> class CSparseFeatures : public CDotFeatures 00078 { 00079 public: 00084 CSparseFeatures(int32_t size=0) 00085 : CDotFeatures(size), num_vectors(0), num_features(0), 00086 sparse_feature_matrix(NULL), feature_cache(NULL) 00087 {} 00088 00097 CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false) 00098 : CDotFeatures(0), num_vectors(0), num_features(0), 00099 sparse_feature_matrix(NULL), feature_cache(NULL) 00100 { 00101 if (!copy) 00102 set_sparse_feature_matrix(src, num_feat, num_vec); 00103 else 00104 { 00105 sparse_feature_matrix = new TSparse<ST>[num_vec]; 00106 memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec); 00107 for (int32_t i=0; i< num_vec; i++) 00108 { 00109 sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries]; 00110 memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries); 00111 00112 } 00113 } 00114 } 00115 00123 CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec) 00124 : CDotFeatures(0), num_vectors(0), num_features(0), 00125 sparse_feature_matrix(NULL), feature_cache(NULL) 00126 { 00127 set_full_feature_matrix(src, num_feat, num_vec); 00128 } 00129 00131 CSparseFeatures(const CSparseFeatures & orig) 00132 : CDotFeatures(orig), num_vectors(orig.num_vectors), 00133 num_features(orig.num_features), 00134 sparse_feature_matrix(orig.sparse_feature_matrix), 00135 feature_cache(orig.feature_cache) 00136 { 00137 if (orig.sparse_feature_matrix) 00138 { 00139 free_sparse_feature_matrix(); 00140 sparse_feature_matrix=new TSparse<ST>[num_vectors]; 00141 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors); 00142 for (int32_t i=0; i< num_vectors; i++) 00143 { 00144 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries]; 00145 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries); 00146 00147 } 00148 } 00149 } 00150 00155 CSparseFeatures(CFile* loader) 00156 : CDotFeatures(loader), num_vectors(0), num_features(0), 00157 sparse_feature_matrix(NULL), feature_cache(NULL) 00158 { 00159 load(loader); 00160 } 00161 00163 virtual ~CSparseFeatures() 00164 { 00165 free_sparse_features(); 00166 } 00167 00171 void free_sparse_feature_matrix() 00172 { 00173 clean_tsparse(sparse_feature_matrix, num_vectors); 00174 sparse_feature_matrix = NULL; 00175 num_vectors=0; 00176 num_features=0; 00177 } 00178 00182 void free_sparse_features() 00183 { 00184 free_sparse_feature_matrix(); 00185 delete feature_cache; 00186 feature_cache = NULL; 00187 } 00188 00193 virtual CFeatures* duplicate() const 00194 { 00195 return new CSparseFeatures<ST>(*this); 00196 } 00197 00205 ST get_feature(int32_t num, int32_t index) 00206 { 00207 ASSERT(index>=0 && index<num_features) ; 00208 ASSERT(num>=0 && num<num_vectors) ; 00209 00210 bool vfree; 00211 int32_t num_feat; 00212 int32_t i; 00213 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00214 ST ret = 0 ; 00215 00216 if (sv) 00217 { 00218 for (i=0; i<num_feat; i++) 00219 if (sv[i].feat_index==index) 00220 ret += sv[i].entry ; 00221 } 00222 00223 free_sparse_feature_vector(sv, num, vfree); 00224 00225 return ret ; 00226 } 00227 00228 00237 ST* get_full_feature_vector(int32_t num, int32_t& len) 00238 { 00239 bool vfree; 00240 int32_t num_feat; 00241 int32_t i; 00242 len=0; 00243 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00244 ST* fv=NULL; 00245 00246 if (sv) 00247 { 00248 len=num_features; 00249 fv=new ST[num_features]; 00250 00251 for (i=0; i<num_features; i++) 00252 fv[i]=0; 00253 00254 for (i=0; i<num_feat; i++) 00255 fv[sv[i].feat_index]= sv[i].entry; 00256 } 00257 00258 free_sparse_feature_vector(sv, num, vfree); 00259 00260 return fv; 00261 } 00262 00269 void get_full_feature_vector(ST** dst, int32_t* len, int32_t num) 00270 { 00271 if (num>=num_vectors) 00272 { 00273 SG_ERROR("Index out of bounds (number of vectors %d, you " 00274 "requested %d)\n", num_vectors, num); 00275 } 00276 00277 bool vfree; 00278 int32_t num_feat=0; 00279 *len=0; 00280 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00281 00282 if (sv) 00283 { 00284 *len=num_features; 00285 *dst= (ST*) malloc(sizeof(ST)*num_features); 00286 memset(*dst, 0, sizeof(ST)*num_features); 00287 00288 for (int32_t i=0; i<num_feat; i++) 00289 (*dst)[sv[i].feat_index]= sv[i].entry; 00290 } 00291 00292 free_sparse_feature_vector(sv, num, vfree); 00293 } 00294 00295 00301 virtual inline int32_t get_nnz_features_for_vector(int32_t num) 00302 { 00303 bool vfree; 00304 int32_t len; 00305 TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree); 00306 free_sparse_feature_vector(sv, num, vfree); 00307 return len; 00308 } 00309 00320 TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree) 00321 { 00322 ASSERT(num<num_vectors); 00323 00324 if (sparse_feature_matrix) 00325 { 00326 len= sparse_feature_matrix[num].num_feat_entries; 00327 vfree=false ; 00328 return sparse_feature_matrix[num].features; 00329 } 00330 else 00331 { 00332 TSparseEntry<ST>* feat=NULL; 00333 vfree=false; 00334 00335 if (feature_cache) 00336 { 00337 feat=feature_cache->lock_entry(num); 00338 00339 if (feat) 00340 return feat; 00341 else 00342 { 00343 feat=feature_cache->set_entry(num); 00344 } 00345 } 00346 00347 if (!feat) 00348 vfree=true; 00349 00350 feat=compute_sparse_feature_vector(num, len, feat); 00351 00352 00353 if (get_num_preproc()) 00354 { 00355 int32_t tmp_len=len; 00356 TSparseEntry<ST>* tmp_feat_before = feat; 00357 TSparseEntry<ST>* tmp_feat_after = NULL; 00358 00359 for (int32_t i=0; i<get_num_preproc(); i++) 00360 { 00361 //tmp_feat_after=((CSparsePreProc<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len); 00362 00363 if (i!=0) // delete feature vector, except for the the first one, i.e., feat 00364 delete[] tmp_feat_before; 00365 tmp_feat_before=tmp_feat_after; 00366 } 00367 00368 memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len); 00369 delete[] tmp_feat_after; 00370 len=tmp_len ; 00371 SG_DEBUG( "len: %d len2: %d\n", len, num_features); 00372 } 00373 return feat ; 00374 } 00375 } 00376 00377 00388 static ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen) 00389 { 00390 ST result=0; 00391 00392 //result remains zero when one of the vectors is non existent 00393 if (avec && bvec) 00394 { 00395 if (alen<=blen) 00396 { 00397 int32_t j=0; 00398 for (int32_t i=0; i<alen; i++) 00399 { 00400 int32_t a_feat_idx=avec[i].feat_index; 00401 00402 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) ) 00403 j++; 00404 00405 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) ) 00406 { 00407 result+= avec[i].entry * bvec[j].entry; 00408 j++; 00409 } 00410 } 00411 } 00412 else 00413 { 00414 int32_t j=0; 00415 for (int32_t i=0; i<blen; i++) 00416 { 00417 int32_t b_feat_idx=bvec[i].feat_index; 00418 00419 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) ) 00420 j++; 00421 00422 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) ) 00423 { 00424 result+= bvec[i].entry * avec[j].entry; 00425 j++; 00426 } 00427 } 00428 } 00429 00430 result*=alpha; 00431 } 00432 00433 return result; 00434 } 00435 00446 ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b) 00447 { 00448 ASSERT(vec); 00449 ASSERT(dim==num_features); 00450 ST result=b; 00451 00452 bool vfree; 00453 int32_t num_feat; 00454 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00455 00456 if (sv) 00457 { 00458 for (int32_t i=0; i<num_feat; i++) 00459 result+=alpha*vec[sv[i].feat_index]*sv[i].entry; 00460 } 00461 00462 free_sparse_feature_vector(sv, num, vfree); 00463 return result; 00464 } 00465 00475 void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false) 00476 { 00477 ASSERT(vec); 00478 if (dim!=num_features) 00479 { 00480 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00481 dim, num_features); 00482 } 00483 00484 bool vfree; 00485 int32_t num_feat; 00486 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00487 00488 if (sv) 00489 { 00490 if (abs_val) 00491 { 00492 for (int32_t i=0; i<num_feat; i++) 00493 vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00494 } 00495 else 00496 { 00497 for (int32_t i=0; i<num_feat; i++) 00498 vec[sv[i].feat_index]+= alpha*sv[i].entry; 00499 } 00500 } 00501 00502 free_sparse_feature_vector(sv, num, vfree); 00503 } 00504 00511 void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free) 00512 { 00513 if (feature_cache) 00514 feature_cache->unlock_entry(num); 00515 00516 if (free) 00517 delete[] feat_vec ; 00518 } 00519 00527 TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec) 00528 { 00529 num_feat=num_features; 00530 num_vec=num_vectors; 00531 00532 return sparse_feature_matrix; 00533 } 00534 00543 void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat, 00544 int32_t* num_vec, int64_t* nnz) 00545 { 00546 *nnz=get_num_nonzero_entries(); 00547 *num_feat=num_features; 00548 *num_vec=num_vectors; 00549 *dst=sparse_feature_matrix; 00550 } 00551 00557 void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec) 00558 { 00559 if (sfm) 00560 { 00561 for (int32_t i=0; i<num_vec; i++) 00562 delete[] sfm[i].features; 00563 00564 delete[] sfm; 00565 } 00566 } 00567 00572 CSparseFeatures<ST>* get_transposed() 00573 { 00574 int32_t num_feat; 00575 int32_t num_vec; 00576 TSparse<ST>* s=get_transposed(num_feat, num_vec); 00577 return new CSparseFeatures<ST>(s, num_feat, num_vec); 00578 } 00579 00589 TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec) 00590 { 00591 num_feat=num_vectors; 00592 num_vec=num_features; 00593 00594 int32_t* hist=new int32_t[num_features]; 00595 memset(hist, 0, sizeof(int32_t)*num_features); 00596 00597 // count how lengths of future feature vectors 00598 for (int32_t v=0; v<num_vectors; v++) 00599 { 00600 int32_t vlen; 00601 bool vfree; 00602 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree); 00603 00604 for (int32_t i=0; i<vlen; i++) 00605 hist[sv[i].feat_index]++; 00606 00607 free_sparse_feature_vector(sv, v, vfree); 00608 } 00609 00610 // allocate room for future feature vectors 00611 TSparse<ST>* sfm=new TSparse<ST>[num_vec]; 00612 for (int32_t v=0; v<num_vec; v++) 00613 { 00614 sfm[v].features= new TSparseEntry<ST>[hist[v]]; 00615 sfm[v].num_feat_entries=hist[v]; 00616 sfm[v].vec_index=v; 00617 } 00618 00619 // fill future feature vectors with content 00620 memset(hist,0,sizeof(int32_t)*num_features); 00621 for (int32_t v=0; v<num_vectors; v++) 00622 { 00623 int32_t vlen; 00624 bool vfree; 00625 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree); 00626 00627 for (int32_t i=0; i<vlen; i++) 00628 { 00629 int32_t vidx=sv[i].feat_index; 00630 int32_t fidx=v; 00631 sfm[vidx].features[hist[vidx]].feat_index=fidx; 00632 sfm[vidx].features[hist[vidx]].entry=sv[i].entry; 00633 hist[vidx]++; 00634 } 00635 00636 free_sparse_feature_vector(sv, v, vfree); 00637 } 00638 00639 delete[] hist; 00640 return sfm; 00641 } 00642 00652 virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec) 00653 { 00654 free_sparse_feature_matrix(); 00655 00656 sparse_feature_matrix=src; 00657 num_features=num_feat; 00658 num_vectors=num_vec; 00659 } 00660 00668 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec) 00669 { 00670 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features); 00671 num_feat=num_features; 00672 num_vec=num_vectors; 00673 00674 ST* fm=new ST[num_feat*num_vec]; 00675 00676 if (fm) 00677 { 00678 for (int64_t i=0; i<num_feat*num_vec; i++) 00679 fm[i]=0; 00680 00681 for (int32_t v=0; v<num_vec; v++) 00682 { 00683 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++) 00684 { 00685 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index; 00686 fm[offs]= sparse_feature_matrix[v].features[f].entry; 00687 } 00688 } 00689 } 00690 else 00691 SG_ERROR( "error allocating memory for dense feature matrix\n"); 00692 00693 return fm; 00694 } 00695 00703 void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec) 00704 { 00705 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features); 00706 *num_feat=num_features; 00707 *num_vec=num_vectors; 00708 00709 *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors); 00710 00711 if (*dst) 00712 { 00713 for (int64_t i=0; i<num_features*num_vectors; i++) 00714 (*dst)[i]=0; 00715 00716 for (int32_t v=0; v<num_vectors; v++) 00717 { 00718 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++) 00719 { 00720 int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index; 00721 (*dst)[offs]= sparse_feature_matrix[v].features[f].entry; 00722 } 00723 } 00724 } 00725 else 00726 SG_ERROR( "error allocating memory for dense feature matrix\n"); 00727 } 00728 00738 virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec) 00739 { 00740 free_sparse_feature_matrix(); 00741 bool result=true; 00742 num_features=num_feat; 00743 num_vectors=num_vec; 00744 00745 SG_INFO("converting dense feature matrix to sparse one\n"); 00746 int32_t* num_feat_entries=new int[num_vectors]; 00747 00748 if (num_feat_entries) 00749 { 00750 int32_t num_total_entries=0; 00751 00752 // count nr of non sparse features 00753 for (int32_t i=0; i< num_vec; i++) 00754 { 00755 num_feat_entries[i]=0; 00756 for (int32_t j=0; j< num_feat; j++) 00757 { 00758 if (src[i*((int64_t) num_feat) + j] != 0) 00759 num_feat_entries[i]++; 00760 } 00761 } 00762 00763 if (num_vec>0) 00764 { 00765 sparse_feature_matrix=new TSparse<ST>[num_vec]; 00766 00767 if (sparse_feature_matrix) 00768 { 00769 for (int32_t i=0; i< num_vec; i++) 00770 { 00771 sparse_feature_matrix[i].vec_index=i; 00772 sparse_feature_matrix[i].num_feat_entries=0; 00773 sparse_feature_matrix[i].features= NULL; 00774 00775 if (num_feat_entries[i]>0) 00776 { 00777 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]]; 00778 00779 if (!sparse_feature_matrix[i].features) 00780 { 00781 SG_INFO( "allocation of features failed\n"); 00782 return false; 00783 } 00784 00785 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i]; 00786 int32_t sparse_feat_idx=0; 00787 00788 for (int32_t j=0; j< num_feat; j++) 00789 { 00790 int64_t pos= i*num_feat + j; 00791 00792 if (src[pos] != 0) 00793 { 00794 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos]; 00795 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j; 00796 sparse_feat_idx++; 00797 num_total_entries++; 00798 } 00799 } 00800 } 00801 } 00802 } 00803 else 00804 { 00805 SG_ERROR( "allocation of sparse feature matrix failed\n"); 00806 result=false; 00807 } 00808 00809 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n", 00810 num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec)); 00811 } 00812 else 00813 { 00814 SG_ERROR( "huh ? zero size matrix given ?\n"); 00815 result=false; 00816 } 00817 } 00818 delete[] num_feat_entries; 00819 return result; 00820 } 00821 00827 virtual bool apply_preproc(bool force_preprocessing=false) 00828 { 00829 SG_INFO( "force: %d\n", force_preprocessing); 00830 00831 if ( sparse_feature_matrix && get_num_preproc() ) 00832 { 00833 for (int32_t i=0; i<get_num_preproc(); i++) 00834 { 00835 if ( (!is_preprocessed(i) || force_preprocessing) ) 00836 { 00837 set_preprocessed(i); 00838 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name()); 00839 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL) 00840 return false; 00841 } 00842 return true; 00843 } 00844 return true; 00845 } 00846 else 00847 { 00848 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n"); 00849 return false; 00850 } 00851 } 00852 00857 virtual int32_t get_size() { return sizeof(ST); } 00858 00864 bool obtain_from_simple(CSimpleFeatures<ST>* sf) 00865 { 00866 int32_t num_feat=0; 00867 int32_t num_vec=0; 00868 ST* fm=sf->get_feature_matrix(num_feat, num_vec); 00869 ASSERT(fm && num_feat>0 && num_vec>0); 00870 00871 return set_full_feature_matrix(fm, num_feat, num_vec); 00872 } 00873 00878 virtual inline int32_t get_num_vectors() { return num_vectors; } 00879 00884 inline int32_t get_num_features() { return num_features; } 00885 00897 inline int32_t set_num_features(int32_t num) 00898 { 00899 int32_t n=num_features; 00900 ASSERT(n<=num); 00901 num_features=num; 00902 return num_features; 00903 } 00904 00909 inline virtual EFeatureClass get_feature_class() { return C_SPARSE; } 00910 00915 inline virtual EFeatureType get_feature_type(); 00916 00923 void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free) 00924 { 00925 if (feature_cache) 00926 feature_cache->unlock_entry(num); 00927 00928 if (free) 00929 delete[] feat_vec ; 00930 } 00931 00936 int64_t get_num_nonzero_entries() 00937 { 00938 int64_t num=0; 00939 for (int32_t i=0; i<num_vectors; i++) 00940 num+=sparse_feature_matrix[i].num_feat_entries; 00941 00942 return num; 00943 } 00944 00950 float64_t* compute_squared(float64_t* sq) 00951 { 00952 ASSERT(sq); 00953 00954 int32_t len=0; 00955 bool do_free=false; 00956 00957 for (int32_t i=0; i<this->get_num_vectors(); i++) 00958 { 00959 sq[i]=0; 00960 TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free); 00961 00962 for (int32_t j=0; j<len; j++) 00963 sq[i] += vec[j].entry * vec[j].entry; 00964 00965 ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free); 00966 } 00967 00968 return sq; 00969 } 00970 00983 float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b) 00984 { 00985 int32_t i,j; 00986 int32_t alen, blen; 00987 bool afree, bfree; 00988 ASSERT(lhs); 00989 ASSERT(rhs); 00990 00991 TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree); 00992 TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree); 00993 ASSERT(avec); 00994 ASSERT(bvec); 00995 00996 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b]; 00997 00998 if (alen<=blen) 00999 { 01000 j=0; 01001 for (i=0; i<alen; i++) 01002 { 01003 int32_t a_feat_idx=avec[i].feat_index; 01004 01005 while ((j<blen) && (bvec[j].feat_index < a_feat_idx)) 01006 j++; 01007 01008 if ((j<blen) && (bvec[j].feat_index == a_feat_idx)) 01009 { 01010 result-=2*(avec[i].entry*bvec[j].entry); 01011 j++; 01012 } 01013 } 01014 } 01015 else 01016 { 01017 j=0; 01018 for (i=0; i<blen; i++) 01019 { 01020 int32_t b_feat_idx=bvec[i].feat_index; 01021 01022 while ((j<alen) && (avec[j].feat_index<b_feat_idx)) 01023 j++; 01024 01025 if ((j<alen) && (avec[j].feat_index == b_feat_idx)) 01026 { 01027 result-=2*(bvec[i].entry*avec[j].entry); 01028 j++; 01029 } 01030 } 01031 } 01032 01033 ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree); 01034 ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree); 01035 01036 return CMath::abs(result); 01037 } 01038 01043 void load(CFile* loader); 01044 01049 void save(CFile* writer); 01050 01058 CLabels* load_svmlight_file(char* fname, bool do_sort_features=true) 01059 { 01060 CLabels* lab=NULL; 01061 01062 size_t blocksize=1024*1024; 01063 size_t required_blocksize=blocksize; 01064 uint8_t* dummy=new uint8_t[blocksize]; 01065 FILE* f=fopen(fname, "ro"); 01066 01067 if (f) 01068 { 01069 free_sparse_feature_matrix(); 01070 num_vectors=0; 01071 num_features=0; 01072 01073 SG_INFO("counting line numbers in file %s\n", fname); 01074 size_t sz=blocksize; 01075 size_t block_offs=0; 01076 size_t old_block_offs=0; 01077 fseek(f, 0, SEEK_END); 01078 size_t fsize=ftell(f); 01079 rewind(f); 01080 01081 while (sz == blocksize) 01082 { 01083 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 01084 bool contains_cr=false; 01085 for (size_t i=0; i<sz; i++) 01086 { 01087 block_offs++; 01088 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 01089 { 01090 num_vectors++; 01091 contains_cr=true; 01092 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); 01093 old_block_offs=block_offs; 01094 } 01095 } 01096 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 01097 } 01098 01099 SG_INFO("found %d feature vectors\n", num_vectors); 01100 delete[] dummy; 01101 blocksize=required_blocksize; 01102 dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL 01103 01104 lab=new CLabels(num_vectors); 01105 sparse_feature_matrix=new TSparse<ST>[num_vectors]; 01106 01107 rewind(f); 01108 sz=blocksize; 01109 int32_t lines=0; 01110 while (sz == blocksize) 01111 { 01112 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 01113 01114 size_t old_sz=0; 01115 for (size_t i=0; i<sz; i++) 01116 { 01117 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) 01118 { 01119 size_t len=i-old_sz+1; 01120 uint8_t* data=&dummy[old_sz]; 01121 01122 for (int32_t j=0; j<len; j++) 01123 dummy[j]=data[j]; 01124 01125 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f); 01126 i=0; 01127 old_sz=0; 01128 sz+=len; 01129 } 01130 01131 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 01132 { 01133 01134 size_t len=i-old_sz; 01135 uint8_t* data=&dummy[old_sz]; 01136 01137 int32_t dims=0; 01138 for (int32_t j=0; j<len; j++) 01139 { 01140 if (data[j]==':') 01141 dims++; 01142 } 01143 01144 if (dims<=0) 01145 { 01146 SG_ERROR("Error in line %d - number of" 01147 " dimensions is %d line is %d characters" 01148 " long\n line_content:'%.*s'\n", lines, 01149 dims, len, len, (const char*) data); 01150 } 01151 01152 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims]; 01153 int32_t j=0; 01154 for (; j<len; j++) 01155 { 01156 if (data[j]==' ') 01157 { 01158 data[j]='\0'; 01159 01160 lab->set_label(lines, atof((const char*) data)); 01161 break; 01162 } 01163 } 01164 01165 int32_t d=0; 01166 j++; 01167 uint8_t* start=&data[j]; 01168 for (; j<len; j++) 01169 { 01170 if (data[j]==':') 01171 { 01172 data[j]='\0'; 01173 01174 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; 01175 num_features=CMath::max(num_features, feat[d].feat_index+1); 01176 01177 j++; 01178 start=&data[j]; 01179 for (; j<len; j++) 01180 { 01181 if (data[j]==' ' || data[j]=='\n') 01182 { 01183 data[j]='\0'; 01184 feat[d].entry=(ST) atof((const char*) start); 01185 d++; 01186 break; 01187 } 01188 } 01189 01190 if (j==len) 01191 { 01192 data[j]='\0'; 01193 feat[dims-1].entry=(ST) atof((const char*) start); 01194 } 01195 01196 j++; 01197 start=&data[j]; 01198 } 01199 } 01200 01201 sparse_feature_matrix[lines].vec_index=lines; 01202 sparse_feature_matrix[lines].num_feat_entries=dims; 01203 sparse_feature_matrix[lines].features=feat; 01204 01205 old_sz=i+1; 01206 lines++; 01207 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 01208 } 01209 } 01210 } 01211 SG_INFO("file successfully read\n"); 01212 fclose(f); 01213 } 01214 01215 delete[] dummy; 01216 01217 if (do_sort_features) 01218 sort_features(); 01219 01220 return lab; 01221 } 01222 01225 void sort_features() 01226 { 01227 ASSERT(get_num_preproc()==0); 01228 01229 if (!sparse_feature_matrix) 01230 SG_ERROR("Requires sparse feature matrix to be available in-memory\n"); 01231 01232 for (int32_t i=0; i<num_vectors; i++) 01233 { 01234 int32_t len=sparse_feature_matrix[i].num_feat_entries; 01235 01236 if (!len) 01237 continue; 01238 01239 TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features; 01240 int32_t* feat_idx=new int32_t[len]; 01241 int32_t* orig_idx=new int32_t[len]; 01242 01243 for (int j=0; j<len; j++) 01244 { 01245 feat_idx[j]=sf_orig[j].feat_index; 01246 orig_idx[j]=j; 01247 } 01248 01249 CMath::qsort_index(feat_idx, orig_idx, len); 01250 01251 TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len]; 01252 for (int j=0; j<len; j++) 01253 sf_new[j]=sf_orig[orig_idx[j]]; 01254 01255 sparse_feature_matrix[i].features=sf_new; 01256 01257 // sanity check 01258 for (int j=0; j<len-1; j++) 01259 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index); 01260 01261 delete[] orig_idx; 01262 delete[] feat_idx; 01263 delete[] sf_orig; 01264 } 01265 } 01266 01273 bool write_svmlight_file(char* fname, CLabels* label) 01274 { 01275 ASSERT(label); 01276 int32_t num=label->get_num_labels(); 01277 ASSERT(num>0); 01278 ASSERT(num==num_vectors); 01279 01280 FILE* f=fopen(fname, "wb"); 01281 01282 if (f) 01283 { 01284 for (int32_t i=0; i<num; i++) 01285 { 01286 fprintf(f, "%d ", (int32_t) label->get_int_label(i)); 01287 01288 TSparseEntry<ST>* vec = sparse_feature_matrix[i].features; 01289 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries; 01290 01291 for (int32_t j=0; j<num_feat; j++) 01292 { 01293 if (j<num_feat-1) 01294 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 01295 else 01296 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 01297 } 01298 } 01299 01300 fclose(f); 01301 return true; 01302 } 01303 return false; 01304 } 01305 01313 virtual int32_t get_dim_feature_space() 01314 { 01315 return num_features; 01316 } 01317 01324 virtual float64_t dot(int32_t vec_idx1, int32_t vec_idx2) 01325 { 01326 bool afree, bfree; 01327 int32_t alen, blen; 01328 TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree); 01329 TSparseEntry<ST>* bvec=get_sparse_feature_vector(vec_idx2, blen, bfree); 01330 01331 float64_t result=sparse_dot(1, avec, alen, bvec, blen); 01332 01333 free_sparse_feature_vector(avec, vec_idx1, afree); 01334 free_sparse_feature_vector(bvec, vec_idx2, bfree); 01335 01336 return result; 01337 } 01338 01345 virtual float64_t dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 01346 { 01347 ASSERT(vec2); 01348 if (vec2_len!=num_features) 01349 { 01350 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 01351 vec2_len, num_features); 01352 } 01353 float64_t result=0; 01354 01355 bool vfree; 01356 int32_t vlen; 01357 TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree); 01358 01359 if (sv) 01360 { 01361 for (int32_t i=0; i<vlen; i++) 01362 result+=vec2[sv[i].feat_index]*sv[i].entry; 01363 } 01364 01365 free_sparse_feature_vector(sv, vec_idx1, vfree); 01366 01367 return result; 01368 } 01369 01371 struct sparse_feature_iterator 01372 { 01374 TSparseEntry<ST>* sv; 01376 int32_t vidx; 01378 int32_t num_feat_entries; 01380 bool vfree; 01381 01383 int32_t index; 01384 01386 void print_info() 01387 { 01388 SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n", 01389 sv, vidx, num_feat_entries, index); 01390 } 01391 }; 01392 01402 virtual void* get_feature_iterator(int32_t vector_index) 01403 { 01404 if (vector_index>=num_vectors) 01405 { 01406 SG_ERROR("Index out of bounds (number of vectors %d, you " 01407 "requested %d)\n", num_vectors, vector_index); 01408 } 01409 01410 if (!sparse_feature_matrix) 01411 SG_ERROR("Requires a in-memory feature matrix\n"); 01412 01413 sparse_feature_iterator* it=new sparse_feature_iterator[1]; 01414 it->sv=get_sparse_feature_vector(vector_index, it->num_feat_entries, it->vfree); 01415 it->vidx=vector_index; 01416 it->index=0; 01417 01418 return it; 01419 } 01420 01431 virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator) 01432 { 01433 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01434 if (!it || it->index >= it->num_feat_entries) 01435 return false; 01436 01437 int32_t i=it->index++; 01438 01439 index = it->sv[i].feat_index; 01440 value = (float64_t) it->sv[i].entry; 01441 01442 return true; 01443 } 01444 01450 virtual void free_feature_iterator(void* iterator) 01451 { 01452 if (!iterator) 01453 return; 01454 01455 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01456 free_sparse_feature_vector(it->sv, it->vidx, it->vfree); 01457 delete[] it; 01458 } 01459 01461 inline virtual const char* get_name() const { return "SparseFeatures"; } 01462 01463 protected: 01474 virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL) 01475 { 01476 len=0; 01477 return NULL; 01478 } 01479 01480 protected: 01481 01483 int32_t num_vectors; 01484 01486 int32_t num_features; 01487 01489 TSparse<ST>* sparse_feature_matrix; 01490 01492 CCache< TSparseEntry<ST> >* feature_cache; 01493 }; 01494 01495 #ifndef DOXYGEN_SHOULD_SKIP_THIS 01496 #define GET_FEATURE_TYPE(sg_type, f_type) \ 01497 template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \ 01498 { \ 01499 return f_type; \ 01500 } 01501 GET_FEATURE_TYPE(bool, F_BOOL) 01502 GET_FEATURE_TYPE(char, F_CHAR) 01503 GET_FEATURE_TYPE(uint8_t, F_BYTE) 01504 GET_FEATURE_TYPE(int16_t, F_SHORT) 01505 GET_FEATURE_TYPE(uint16_t, F_WORD) 01506 GET_FEATURE_TYPE(int32_t, F_INT) 01507 GET_FEATURE_TYPE(uint32_t, F_UINT) 01508 GET_FEATURE_TYPE(int64_t, F_LONG) 01509 GET_FEATURE_TYPE(uint64_t, F_ULONG) 01510 GET_FEATURE_TYPE(float32_t, F_SHORTREAL) 01511 GET_FEATURE_TYPE(float64_t, F_DREAL) 01512 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL) 01513 #undef GET_FEATURE_TYPE 01514 01515 #define LOAD(fname, sg_type) \ 01516 template<> inline void CSparseFeatures<sg_type>::load(CFile* loader) \ 01517 { \ 01518 ASSERT(loader); \ 01519 TSparse<sg_type>* matrix=NULL; \ 01520 int32_t num_feat=0; \ 01521 int32_t num_vec=0; \ 01522 loader->fname(matrix, num_feat, num_vec); \ 01523 set_sparse_feature_matrix(matrix, num_feat, num_vec); \ 01524 } 01525 LOAD(get_bool_sparsematrix, bool) 01526 LOAD(get_char_sparsematrix, char) 01527 LOAD(get_byte_sparsematrix, uint8_t) 01528 LOAD(get_short_sparsematrix, int16_t) 01529 LOAD(get_word_sparsematrix, uint16_t) 01530 LOAD(get_int_sparsematrix, int32_t) 01531 LOAD(get_uint_sparsematrix, uint32_t) 01532 LOAD(get_long_sparsematrix, int64_t) 01533 LOAD(get_ulong_sparsematrix, uint64_t) 01534 LOAD(get_shortreal_sparsematrix, float32_t) 01535 LOAD(get_real_sparsematrix, float64_t) 01536 LOAD(get_longreal_sparsematrix, floatmax_t) 01537 #undef LOAD 01538 01539 #define WRITE(fname, sg_type) \ 01540 template<> inline void CSparseFeatures<sg_type>::save(CFile* writer) \ 01541 { \ 01542 ASSERT(writer); \ 01543 writer->fname(sparse_feature_matrix, num_features, num_vectors); \ 01544 } 01545 WRITE(set_bool_sparsematrix, bool) 01546 WRITE(set_char_sparsematrix, char) 01547 WRITE(set_byte_sparsematrix, uint8_t) 01548 WRITE(set_short_sparsematrix, int16_t) 01549 WRITE(set_word_sparsematrix, uint16_t) 01550 WRITE(set_int_sparsematrix, int32_t) 01551 WRITE(set_uint_sparsematrix, uint32_t) 01552 WRITE(set_long_sparsematrix, int64_t) 01553 WRITE(set_ulong_sparsematrix, uint64_t) 01554 WRITE(set_shortreal_sparsematrix, float32_t) 01555 WRITE(set_real_sparsematrix, float64_t) 01556 WRITE(set_longreal_sparsematrix, floatmax_t) 01557 #undef WRITE 01558 #endif // DOXYGEN_SHOULD_SKIP_THIS 01559 } 01560 #endif /* _SPARSEFEATURES__H__ */