|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #ifndef _WEIGHTEDDEGREESTRINGKERNEL_H___ 00013 #define _WEIGHTEDDEGREESTRINGKERNEL_H___ 00014 00015 #include "lib/common.h" 00016 #include "lib/Trie.h" 00017 #include "kernel/StringKernel.h" 00018 #include "kernel/MultitaskKernelMklNormalizer.h" 00019 #include "features/StringFeatures.h" 00020 00021 00022 00023 00024 00025 00026 namespace shogun 00027 { 00028 00029 enum EWDKernType 00030 { 00031 E_WD=0, 00032 E_EXTERNAL=1, 00033 00034 E_BLOCK_CONST=2, 00035 E_BLOCK_LINEAR=3, 00036 E_BLOCK_SQPOLY=4, 00037 E_BLOCK_CUBICPOLY=5, 00038 E_BLOCK_EXP=6, 00039 E_BLOCK_LOG=7, 00040 E_BLOCK_EXTERNAL=8 00041 }; 00042 00043 00058 class CWeightedDegreeStringKernel: public CStringKernel<char> 00059 { 00060 public: 00061 00067 CWeightedDegreeStringKernel(int32_t degree, EWDKernType type=E_WD); 00068 00074 CWeightedDegreeStringKernel(float64_t* weights, int32_t degree); 00075 00082 CWeightedDegreeStringKernel( 00083 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree); 00084 00085 virtual ~CWeightedDegreeStringKernel(); 00086 00093 virtual bool init(CFeatures* l, CFeatures* r); 00094 00096 virtual void cleanup(); 00097 00105 EWDKernType get_type() const 00106 { 00107 return type; 00108 } 00109 00114 int32_t get_degree() const 00115 { 00116 return degree; 00117 } 00118 00124 int32_t get_max_mismatch() const 00125 { 00126 return max_mismatch; 00127 } 00128 00133 virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREE; } 00134 00139 virtual const char* get_name() const { return 
"WeightedDegree"; } 00140 00148 inline virtual bool init_optimization( 00149 int32_t count, int32_t *IDX, float64_t* alphas) 00150 { 00151 return init_optimization(count, IDX, alphas, -1); 00152 } 00153 00164 virtual bool init_optimization( 00165 int32_t count, int32_t *IDX, float64_t* alphas, int32_t tree_num); 00166 00171 virtual bool delete_optimization(); 00172 00178 virtual float64_t compute_optimized(int32_t idx) 00179 { 00180 if (get_is_initialized()) 00181 return compute_by_tree(idx); 00182 00183 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n"); 00184 return 0; 00185 } 00186 00191 static void* compute_batch_helper(void* p); 00192 00203 virtual void compute_batch( 00204 int32_t num_vec, int32_t* vec_idx, float64_t* target, 00205 int32_t num_suppvec, int32_t* IDX, float64_t* alphas, 00206 float64_t factor=1.0); 00207 00211 inline virtual void clear_normal() 00212 { 00213 if (get_is_initialized()) 00214 { 00215 00216 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK) 00217 SG_ERROR("not implemented"); 00218 00219 tries->delete_trees(max_mismatch==0); 00220 set_is_initialized(false); 00221 } 00222 } 00223 00229 inline virtual void add_to_normal(int32_t idx, float64_t weight) 00230 { 00231 00232 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK) 00233 SG_ERROR("not implemented"); 00234 00235 if (max_mismatch==0) 00236 add_example_to_tree(idx, weight); 00237 else 00238 add_example_to_tree_mismatch(idx, weight); 00239 00240 set_is_initialized(true); 00241 } 00242 00247 inline virtual int32_t get_num_subkernels() 00248 { 00249 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK) 00250 return ((CMultitaskKernelMklNormalizer*)normalizer)->get_num_betas(); 00251 if (position_weights!=NULL) 00252 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ; 00253 if (length==0) 00254 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize); 00255 return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ; 00256 } 
00257 00263 inline void compute_by_subkernel( 00264 int32_t idx, float64_t * subkernel_contrib) 00265 { 00266 00267 if (get_is_initialized()) 00268 { 00269 00270 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK) 00271 SG_ERROR("not implemented"); 00272 00273 compute_by_tree(idx, subkernel_contrib); 00274 return ; 00275 } 00276 00277 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n"); 00278 } 00279 00285 inline const float64_t* get_subkernel_weights(int32_t& num_weights) 00286 { 00287 00288 num_weights = get_num_subkernels(); 00289 00290 delete[] weights_buffer ; 00291 weights_buffer = new float64_t[num_weights]; 00292 00293 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK) 00294 for (int32_t i=0; i<num_weights; i++) 00295 weights_buffer[i] = ((CMultitaskKernelMklNormalizer*)normalizer)->get_beta(i); 00296 else if (position_weights!=NULL) 00297 for (int32_t i=0; i<num_weights; i++) 00298 weights_buffer[i] = position_weights[i*mkl_stepsize]; 00299 else 00300 for (int32_t i=0; i<num_weights; i++) 00301 weights_buffer[i] = weights[i*mkl_stepsize]; 00302 00303 return weights_buffer; 00304 } 00305 00311 inline void set_subkernel_weights( 00312 float64_t* weights2, int32_t num_weights2) 00313 { 00314 int32_t num_weights = get_num_subkernels(); 00315 if (num_weights!=num_weights2) 00316 SG_ERROR( "number of weights do not match\n"); 00317 00318 00319 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK) 00320 for (int32_t i=0; i<num_weights; i++) 00321 ((CMultitaskKernelMklNormalizer*)normalizer)->set_beta(i, weights2[i]); 00322 else if (position_weights!=NULL) 00323 { 00324 for (int32_t i=0; i<num_weights; i++) 00325 { 00326 for (int32_t j=0; j<mkl_stepsize; j++) 00327 { 00328 if (i*mkl_stepsize+j<seq_length) 00329 position_weights[i*mkl_stepsize+j] = weights2[i]; 00330 } 00331 } 00332 } 00333 else if (length==0) 00334 { 00335 for (int32_t i=0; i<num_weights; i++) 00336 { 00337 for (int32_t j=0; 
j<mkl_stepsize; j++) 00338 { 00339 if (i*mkl_stepsize+j<get_degree()) 00340 weights[i*mkl_stepsize+j] = weights2[i]; 00341 } 00342 } 00343 } 00344 else 00345 { 00346 for (int32_t i=0; i<num_weights; i++) 00347 { 00348 for (int32_t j=0; j<mkl_stepsize; j++) 00349 { 00350 if (i*mkl_stepsize+j<get_degree()*length) 00351 weights[i*mkl_stepsize+j] = weights2[i]; 00352 } 00353 } 00354 } 00355 } 00356 00361 virtual bool set_normalizer(CKernelNormalizer* normalizer_) { 00362 00363 if (normalizer_ && strcmp(normalizer_->get_name(),"MultitaskKernelTreeNormalizer")==0) { 00364 unset_property(KP_LINADD); 00365 unset_property(KP_BATCHEVALUATION); 00366 } 00367 else 00368 { 00369 set_property(KP_LINADD); 00370 set_property(KP_BATCHEVALUATION); 00371 } 00372 00373 00374 return CStringKernel<char>::set_normalizer(normalizer_); 00375 00376 } 00377 00378 // other kernel tree operations 00384 float64_t *compute_abs_weights(int32_t & len); 00385 00392 void compute_by_tree(int32_t idx, float64_t *LevelContrib); 00393 00398 bool is_tree_initialized() { return tree_initialized; } 00399 00405 inline float64_t *get_degree_weights(int32_t& d, int32_t& len) 00406 { 00407 d=degree; 00408 len=length; 00409 return weights; 00410 } 00411 00417 inline float64_t *get_weights(int32_t& num_weights) 00418 { 00419 00420 if (normalizer && normalizer->get_normalizer_type()==N_MULTITASK) 00421 SG_ERROR("not implemented"); 00422 00423 if (position_weights!=NULL) 00424 { 00425 num_weights = seq_length ; 00426 return position_weights ; 00427 } 00428 if (length==0) 00429 num_weights = degree ; 00430 else 00431 num_weights = degree*length ; 00432 return weights; 00433 } 00434 00440 inline float64_t *get_position_weights(int32_t& len) 00441 { 00442 len=seq_length; 00443 return position_weights; 00444 } 00445 00451 bool set_wd_weights_by_type(EWDKernType type); 00452 00459 void set_wd_weights(float64_t* p_weights, int32_t d) 00460 { 00461 set_weights(p_weights,d,0); 00462 } 00463 00470 bool 
set_weights(float64_t* weights, int32_t d, int32_t len); 00471 00478 bool set_position_weights(float64_t* position_weights, int32_t len=0); 00479 00484 bool init_block_weights(); 00485 00490 bool init_block_weights_from_wd(); 00491 00496 bool init_block_weights_from_wd_external(); 00497 00502 bool init_block_weights_const(); 00503 00508 bool init_block_weights_linear(); 00509 00514 bool init_block_weights_sqpoly(); 00515 00520 bool init_block_weights_cubicpoly(); 00521 00526 bool init_block_weights_exp(); 00527 00532 bool init_block_weights_log(); 00533 00538 bool init_block_weights_external(); 00539 00544 bool delete_position_weights() 00545 { 00546 delete[] position_weights; 00547 position_weights=NULL; 00548 return true; 00549 } 00550 00556 bool set_max_mismatch(int32_t max); 00557 00562 inline int32_t get_max_mismatch() { return max_mismatch; } 00563 00569 inline bool set_degree(int32_t deg) { degree=deg; return true; } 00570 00575 inline int32_t get_degree() { return degree; } 00576 00582 inline bool set_use_block_computation(bool block) 00583 { 00584 block_computation=block; 00585 return true; 00586 } 00587 00592 inline bool get_use_block_computation() { return block_computation; } 00593 00599 inline bool set_mkl_stepsize(int32_t step) 00600 { 00601 if (step<1) 00602 SG_ERROR("Stepsize must be a positive integer\n"); 00603 mkl_stepsize=step; 00604 return true; 00605 } 00606 00611 inline int32_t get_mkl_stepsize() { return mkl_stepsize; } 00612 00618 inline bool set_which_degree(int32_t which) 00619 { 00620 which_degree=which; 00621 return true; 00622 } 00623 00628 inline int32_t get_which_degree() { return which_degree; } 00629 00630 protected: 00632 void create_empty_tries(); 00633 00639 void add_example_to_tree(int32_t idx, float64_t weight); 00640 00647 void add_example_to_single_tree( 00648 int32_t idx, float64_t weight, int32_t tree_num); 00649 00655 void add_example_to_tree_mismatch(int32_t idx, float64_t weight); 00656 00663 void 
add_example_to_single_tree_mismatch( 00664 int32_t idx, float64_t weight, int32_t tree_num); 00665 00671 float64_t compute_by_tree(int32_t idx); 00672 00681 float64_t compute(int32_t idx_a, int32_t idx_b); 00682 00691 float64_t compute_with_mismatch( 00692 char* avec, int32_t alen, char* bvec, int32_t blen); 00693 00702 float64_t compute_without_mismatch( 00703 char* avec, int32_t alen, char* bvec, int32_t blen); 00704 00713 float64_t compute_without_mismatch_matrix( 00714 char* avec, int32_t alen, char* bvec, int32_t blen); 00715 00724 float64_t compute_using_block(char* avec, int32_t alen, 00725 char* bvec, int32_t blen); 00726 00728 virtual void remove_lhs(); 00729 00730 00731 #ifdef HAVE_BOOST_SERIALIZATION 00732 00733 00734 00735 00736 private: 00737 00738 // serialization needs to split up in save/load because 00739 // the serialization of pointers to natives (int* & friends) 00740 // requires a workaround 00741 friend class ::boost::serialization::access; 00742 00743 // friend std::ostream & operator<<(std::ostream &os, const CWeightedDegreeStringKernel &gp); 00744 //template<class Archive> 00745 //friend void ::boost::serialization::save_construct_data(Archive & ar, const CWeightedDegreeStringKernel* t, const unsigned int file_version); 00746 template<class Archive> 00747 void save(Archive & ar, const unsigned int archive_version) const 00748 { 00749 00750 SG_DEBUG("archiving CWeightedDegreeStringKernel\n"); 00751 00752 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this); 00753 00754 00757 ar & mkl_stepsize ; 00758 //ar & degree; 00759 ar & length; 00760 ar & max_mismatch ; 00761 00762 //for (int32_t i=0; i<degree*(1+max_mismatch); i++) 00763 // ar & weights[i]; 00764 00765 //TODO how long? 
00766 //float64_t* position_weights ; 00767 //float64_t* weights_buffer ; 00768 00769 ar & seq_length ; 00770 00771 ar & initialized ; 00772 ar & block_computation; 00773 //ar & use_normalization ; 00774 00775 //ar & normalization_const; 00776 00777 ar & num_block_weights_external; 00778 for (int32_t i=0; i < num_block_weights_external; ++i) 00779 { 00780 ar & block_weights_external[i]; 00781 } 00782 00783 //TODO how long 00784 //float64_t* block_weights; 00785 //ar & type; 00786 ar & which_degree; 00787 00788 //TODO implement 00789 //CTrie<DNATrie> tries ; 00790 //ar & tree_initialized ; 00791 00792 00793 //CWeightedDegreeStringKernel* tmp = const_cast<CWeightedDegreeStringKernel*>(this); 00794 //tmp->create_empty_tries(); 00795 //create_empty_tries(); 00796 00797 SG_DEBUG("done with CWeightedDegreeStringKernel\n"); 00798 00799 } 00800 00801 template<class Archive> 00802 void load(Archive & ar, const unsigned int archive_version) 00803 { 00804 SG_DEBUG("archiving CWeightedDegreeStringKernel\n"); 00805 00806 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this); 00807 00808 00811 ar & mkl_stepsize ; 00812 ar & degree; 00813 ar & length; 00814 ar & max_mismatch ; 00815 00816 //weights=new float64_t[degree*(1+max_mismatch)]; 00817 //for (int32_t i=0; i<degree*(1+max_mismatch); i++) 00818 // ar & weights[i]; 00819 00820 00821 //TODO how long? 
00822 //float64_t* position_weights ; 00823 //float64_t* weights_buffer ; 00824 00825 ar & seq_length ; 00826 00827 ar & initialized ; 00828 ar & block_computation; 00829 //ar & use_normalization ; 00830 00831 //ar & normalization_const; 00832 00833 ar & num_block_weights_external; 00834 //float64_t* block_weights_external; 00835 block_weights_external = new float64_t[num_block_weights_external]; 00836 for (int32_t i=0; i < num_block_weights_external; ++i) 00837 { 00838 ar & block_weights_external[i]; 00839 } 00840 00841 //TODO how long 00842 //float64_t* block_weights; 00843 //ar & type; 00844 ar & which_degree; 00845 00846 //TODO implement 00847 //CTrie<DNATrie> tries ; 00848 //ar & tree_initialized ; 00849 00850 SG_DEBUG("done with CWeightedDegreeStringKernel\n"); 00851 00852 } 00853 00854 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER(); 00855 00856 00857 public: 00858 00859 virtual std::string toString() const 00860 { 00861 std::ostringstream s; 00862 00863 ::boost::archive::text_oarchive oa(s); 00864 00865 oa << *this; 00866 00867 return s.str(); 00868 } 00869 00870 virtual void fromString(std::string str) 00871 { 00872 00873 std::istringstream is(str); 00874 00875 ::boost::archive::text_iarchive ia(is); 00876 00877 ia >> *this; 00878 00879 } 00880 00881 #endif //HAVE_BOOST_SERIALIZATION 00882 00883 00884 protected: 00888 float64_t* weights; 00890 float64_t* position_weights; 00892 float64_t* weights_buffer; 00894 int32_t mkl_stepsize; 00896 int32_t degree; 00898 int32_t length; 00899 00901 int32_t max_mismatch; 00903 int32_t seq_length; 00904 00906 bool initialized; 00907 00909 bool block_computation; 00910 00912 int32_t num_block_weights_external; 00914 float64_t* block_weights_external; 00915 00917 float64_t* block_weights; 00919 EWDKernType type; 00921 int32_t which_degree; 00922 00924 CTrie<DNATrie>* tries; 00925 00927 bool tree_initialized; 00928 00930 CAlphabet* alphabet; 00931 }; 00932 00933 } 00934 00935 00936 00937 #ifdef HAVE_BOOST_SERIALIZATION 00938 
#include <boost/serialization/export.hpp>
#include <iostream>

#endif //HAVE_BOOST_SERIALIZATION


#ifdef HAVE_BOOST_SERIALIZATION

namespace boost
{
	namespace serialization
	{
		/** Write the constructor arguments of a CWeightedDegreeStringKernel.
		 *
		 * Stores type followed by degree; load_construct_data() reads them
		 * back in the same order.
		 *
		 * The object pointer must be const-qualified: Boost.Serialization
		 * invokes save_construct_data with a `const T*`, so an overload
		 * taking a non-const pointer is never selected and the library's
		 * default (which writes nothing) would be used instead.
		 *
		 * The values are obtained through the public const getters because
		 * this free function is not a friend of the class and cannot touch
		 * its protected members.
		 *
		 * TODO: lhs/rhs features and max_mismatch are not stored here.
		 *
		 * @param ar output archive
		 * @param t kernel whose constructor arguments are saved
		 * @param file_version archive class version (unused)
		 */
		template<class Archive>
			inline void save_construct_data(Archive & ar, const shogun::CWeightedDegreeStringKernel* t, const unsigned int file_version)
			{
				std::cout << "saving WDK from non-defaultconstruct data works" << std::endl;

				// Named const locals: archive operator<< needs an lvalue on
				// some Boost versions.
				const shogun::EWDKernType type = t->get_type();
				const int32_t degree = t->get_degree();

				ar << type;
				ar << degree;

				std::cout << "done saving WDK from non-defaultconstruct data" << std::endl;
			}

		/** Reconstruct a CWeightedDegreeStringKernel in pre-allocated storage.
		 *
		 * Reads type and degree (as written by save_construct_data) and
		 * placement-constructs the kernel at t using the
		 * (degree, type) constructor.
		 *
		 * @param ar input archive
		 * @param t uninitialized storage for the kernel
		 * @param file_version archive class version (unused)
		 */
		template<class Archive>
			inline void load_construct_data(Archive & ar, shogun::CWeightedDegreeStringKernel * t, const unsigned int file_version)
			{
				std::cout << "loading WDK from non-defaultconstruct data" << std::endl;

				shogun::EWDKernType type;
				int32_t degree;

				ar >> type;
				ar >> degree;

				::new(t)shogun::CWeightedDegreeStringKernel(degree, type);

				std::cout << "done loading WDK from non-defaultconstruct data" << std::endl;
			}
	} // serialization
} // namespace boost
#endif //HAVE_BOOST_SERIALIZATION

//BOOST_CLASS_EXPORT_KEY2(shogun::CWeightedDegreeStringKernel, "CWeightedDegreeStringKernel");


#endif /* _WEIGHTEDDEGREESTRINGKERNEL_H___ */