SHOGUN v0.9.3
Kernel.h
Go to the documentation of this file.
00001 /*
00002  * EXCEPT FOR THE KERNEL CACHING FUNCTIONS WHICH ARE (W) THORSTEN JOACHIMS
00003  * COPYRIGHT (C) 1999  UNIVERSITAET DORTMUND - ALL RIGHTS RESERVED
00004  *
00005  * this program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 3 of the License, or
00008  * (at your option) any later version.
00009  *
00010  * Written (W) 1999-2009 Soeren Sonnenburg
00011  * Written (W) 1999-2008 Gunnar Raetsch
00012  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00013  */
00014 
00015 #ifndef _KERNEL_H___
00016 #define _KERNEL_H___
00017 
00018 #include "lib/common.h"
00019 #include "lib/Signal.h"
00020 #include "lib/File.h"
00021 #include "lib/Mathematics.h"
00022 #include "base/SGObject.h"
00023 #include "features/Features.h"
00024 #include "kernel/KernelNormalizer.h"
00025 
00026 
00027 namespace shogun
00028 {
00029     class CFile;
00030     class CFeatures;
00031     class CKernelNormalizer;
00032     enum EFeatureType;
00033     enum EFeatureClass;
00034 
00035 #ifdef USE_SHORTREAL_KERNELCACHE
00036     typedef float32_t KERNELCACHE_ELEM;
00037 #else
00038     typedef float64_t KERNELCACHE_ELEM;
00039 #endif
00040 
00041 typedef int64_t KERNELCACHE_IDX;
00042 
00043 
00044 enum EOptimizationType
00045 {
00046     FASTBUTMEMHUNGRY,
00047     SLOWBUTMEMEFFICIENT
00048 };
00049 
00050 enum EKernelType
00051 {
00052     K_UNKNOWN = 0,
00053     K_LINEAR = 10,
00054     K_SPARSELINEAR = 11,
00055     K_POLY = 20,
00056     K_GAUSSIAN = 30,
00057     K_SPARSEGAUSSIAN = 31,
00058     K_GAUSSIANSHIFT = 32,
00059     K_GAUSSIANMATCH = 33,
00060     K_HISTOGRAM = 40,
00061     K_SALZBERG = 41,
00062     K_LOCALITYIMPROVED = 50,
00063     K_SIMPLELOCALITYIMPROVED = 60,
00064     K_FIXEDDEGREE = 70,
00065     K_WEIGHTEDDEGREE =    80,
00066     K_WEIGHTEDDEGREEPOS = 81,
00067     K_WEIGHTEDDEGREERBF = 82,
00068     K_WEIGHTEDCOMMWORDSTRING = 90,
00069     K_POLYMATCH = 100,
00070     K_ALIGNMENT = 110,
00071     K_COMMWORDSTRING = 120,
00072     K_COMMULONGSTRING = 121,
00073     K_SPECTRUMMISMATCHRBF = 122,
00074     K_COMBINED = 140,
00075     K_AUC = 150,
00076     K_CUSTOM = 160,
00077     K_SIGMOID = 170,
00078     K_CHI2 = 180,
00079     K_DIAG = 190,
00080     K_CONST = 200,
00081     K_DISTANCE = 220,
00082     K_LOCALALIGNMENT = 230,
00083     K_PYRAMIDCHI2 = 240,
00084     K_OLIGO = 250,
00085     K_MATCHWORD = 260,
00086     K_TPPK = 270,
00087     K_REGULATORYMODULES = 280
00088 };
00089 
00090 enum EKernelProperty
00091 {
00092     KP_NONE = 0,
00093     KP_LINADD = 1,  // Kernels that can be optimized via doing normal updates w + dw
00094     KP_KERNCOMBINATION = 2, // Kernels that are infact a linear combination of subkernels K=\sum_i b_i*K_i
00095     KP_BATCHEVALUATION = 4  // Kernels that can on the fly generate normals in linadd and more quickly/memory efficient process batches instead of single examples
00096 };
00097 
00099 template <class T> struct K_THREAD_PARAM
00100 {
00102     CKernel* kernel;
00104     int32_t start;
00106     int32_t end;
00108     int32_t total_start;
00110     int32_t total_end;
00112     int32_t m;
00114     int32_t n;
00116     T* result;
00118     bool symmetric;
00120     bool verbose;
00121 };
00122 
00123 class CSVM;
00124 
00150 class CKernel : public CSGObject
00151 {
00152     friend class CVarianceKernelNormalizer;
00153     friend class CSqrtDiagKernelNormalizer;
00154     friend class CAvgDiagKernelNormalizer;
00155     friend class CRidgeKernelNormalizer;
00156     friend class CFirstElementKernelNormalizer;
00157     friend class CMultitaskKernelNormalizer;
00158     friend class CMultitaskKernelMklNormalizer;
00159     friend class CMultitaskKernelMaskNormalizer;
00160     friend class CMultitaskKernelMaskPairNormalizer;
00161     friend class CTanimotoKernelNormalizer;
00162     friend class CDiceKernelNormalizer;
00163 
00164     public:
00165 
00169         CKernel();
00170 
00171 
00176         CKernel(int32_t size);
00177 
00184         CKernel(CFeatures* l, CFeatures* r, int32_t size);
00185 
00186         virtual ~CKernel();
00187 
00195         inline float64_t kernel(int32_t idx_a, int32_t idx_b)
00196         {
00197             if (idx_a<0 || idx_b<0 || idx_a>=num_lhs || idx_b>=num_rhs)
00198             {
00199                 SG_ERROR("Index out of Range: idx_a=%d/%d idx_b=%d/%d\n",
00200                         idx_a,num_lhs, idx_b,num_rhs);
00201             }
00202 
00203             return normalizer->normalize(compute(idx_a, idx_b), idx_a, idx_b);
00204         }
00205 
00212         void get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n);
00213 
00221         template <class T>
00222         T* get_kernel_matrix(int32_t &m, int32_t &n, T* target)
00223         {
00224             T* result = NULL;
00225 
00226             if (!has_features())
00227                 SG_ERROR( "no features assigned to kernel\n");
00228 
00229             if (target && (m!=get_num_vec_lhs() ||
00230                         n!=get_num_vec_rhs()) )
00231             {
00232                 SG_ERROR( "kernel matrix size mismatch\n");
00233             }
00234 
00235             m=get_num_vec_lhs();
00236             n=get_num_vec_rhs();
00237 
00238             int64_t total_num = int64_t(m)*n;
00239 
00240             // if lhs == rhs and sizes match assume k(i,j)=k(j,i)
00241             bool symmetric= (lhs && lhs==rhs && m==n);
00242 
00243             SG_DEBUG( "returning kernel matrix of size %dx%d\n", m, n);
00244 
00245             if (target)
00246                 result=target;
00247             else
00248                 result=new T[total_num];
00249 
00250             int32_t num_threads=parallel->get_num_threads();
00251             if (num_threads < 2)
00252             {
00253                 K_THREAD_PARAM<T> params;
00254                 params.kernel=this;
00255                 params.result=result;
00256                 params.start=0;
00257                 params.end=m;
00258                 params.total_start=0;
00259                 params.total_end=total_num;
00260                 params.n=n;
00261                 params.m=m;
00262                 params.symmetric=symmetric;
00263                 params.verbose=true;
00264                 get_kernel_matrix_helper<T>((void*) &params);
00265             }
00266             else
00267             {
00268                 pthread_t* threads = new pthread_t[num_threads-1];
00269                 K_THREAD_PARAM<T>* params = new K_THREAD_PARAM<T>[num_threads];
00270                 int64_t step= total_num/num_threads;
00271 
00272                 int32_t t;
00273 
00274                 for (t=0; t<num_threads-1; t++)
00275                 {
00276                     params[t].kernel = this;
00277                     params[t].result = result;
00278                     params[t].start = compute_row_start(t*step, n, symmetric);
00279                     params[t].end = compute_row_start((t+1)*step, n, symmetric);
00280                     params[t].total_start=t*step;
00281                     params[t].total_end=(t+1)*step;
00282                     params[t].n=n;
00283                     params[t].m=m;
00284                     params[t].symmetric=symmetric;
00285                     params[t].verbose=false;
00286                     pthread_create(&threads[t], NULL,
00287                             CKernel::get_kernel_matrix_helper<T>, (void*)&params[t]);
00288                 }
00289 
00290                 params[t].kernel = this;
00291                 params[t].result = result;
00292                 params[t].start = compute_row_start(t*step, n, symmetric);
00293                 params[t].end = m;
00294                 params[t].total_start=t*step;
00295                 params[t].total_end=total_num;
00296                 params[t].n=n;
00297                 params[t].m=m;
00298                 params[t].symmetric=symmetric;
00299                 params[t].verbose=true;
00300                 get_kernel_matrix_helper<T>(&params[t]);
00301 
00302                 for (t=0; t<num_threads-1; t++)
00303                     pthread_join(threads[t], NULL);
00304 
00305                 delete[] params;
00306                 delete[] threads;
00307             }
00308 
00309             SG_DONE();
00310 
00311             return result;
00312         }
00313 
00314 
00325         virtual bool init(CFeatures* lhs, CFeatures* rhs);
00326 
00331         virtual bool set_normalizer(CKernelNormalizer* normalizer);
00332 
00337         virtual CKernelNormalizer* get_normalizer();
00338 
00342         virtual bool init_normalizer();
00343 
00350         virtual void cleanup();
00351 
00356         void load(CFile* loader);
00357 
00362         void save(CFile* writer);
00363 
00368         inline CFeatures* get_lhs() { SG_REF(lhs); return lhs; }
00369 
00374         inline CFeatures* get_rhs() { SG_REF(rhs); return rhs; }
00375 
00380         virtual inline int32_t get_num_vec_lhs()
00381         {
00382             return num_lhs;
00383         }
00384 
00389         virtual inline int32_t get_num_vec_rhs()
00390         {
00391             return num_rhs;
00392         }
00393 
00398         virtual inline bool has_features()
00399         {
00400             return lhs && rhs;
00401         }
00402 
00407         inline bool lhs_equals_rhs()
00408         {
00409             return lhs==rhs;
00410         }
00411 
00413         virtual void remove_lhs_and_rhs();
00414 
00416         virtual void remove_lhs();
00417 
00419         virtual void remove_rhs();
00420 
00428         virtual EKernelType get_kernel_type()=0 ;
00429 
00436         virtual EFeatureType get_feature_type()=0;
00437 
00444         virtual EFeatureClass get_feature_class()=0;
00445 
00450         inline void set_cache_size(int32_t size)
00451         {
00452             cache_size = size;
00453 #ifdef USE_SVMLIGHT
00454             cache_reset();
00455 #endif //USE_SVMLIGHT
00456         }
00457 
00462         inline int32_t get_cache_size() { return cache_size; }
00463 
00464 #ifdef USE_SVMLIGHT
00465 
00466         inline void cache_reset() { resize_kernel_cache(cache_size); }
00467 
00472         inline int32_t get_max_elems_cache() { return kernel_cache.max_elems; }
00473 
00478         inline int32_t get_activenum_cache() { return kernel_cache.activenum; }
00479 
00487         void get_kernel_row(
00488             int32_t docnum, int32_t *active2dnum, float64_t *buffer,
00489             bool full_line=false);
00490 
00495         void cache_kernel_row(int32_t x);
00496 
00502         void cache_multiple_kernel_rows(int32_t* key, int32_t varnum);
00503 
00505         void kernel_cache_reset_lru();
00506 
00513         void kernel_cache_shrink(
00514             int32_t totdoc, int32_t num_shrink, int32_t *after);
00515 
00521         void resize_kernel_cache(KERNELCACHE_IDX size,
00522             bool regression_hack=false);
00523 
00528         inline void set_time(int32_t t)
00529         {
00530             kernel_cache.time=t;
00531         }
00532 
00538         inline int32_t kernel_cache_touch(int32_t cacheidx)
00539         {
00540             if(kernel_cache.index[cacheidx] != -1)
00541             {
00542                 kernel_cache.lru[kernel_cache.index[cacheidx]]=kernel_cache.time;
00543                 return(1);
00544             }
00545             return(0);
00546         }
00547 
00553         inline int32_t kernel_cache_check(int32_t cacheidx)
00554         {
00555             return(kernel_cache.index[cacheidx] >= 0);
00556         }
00557 
00562         inline int32_t kernel_cache_space_available()
00563         {
00564             return(kernel_cache.elems < kernel_cache.max_elems);
00565         }
00566 
00572         void kernel_cache_init(int32_t size, bool regression_hack=false);
00573 
00575         void kernel_cache_cleanup();
00576 
00577 #endif //USE_SVMLIGHT
00578 
00580         void list_kernel();
00581 
00587         inline bool has_property(EKernelProperty p) { return (properties & p) != 0; }
00588 
00592         virtual void clear_normal();
00593 
00599         virtual void add_to_normal(int32_t vector_idx, float64_t weight);
00600 
00605         inline EOptimizationType get_optimization_type() { return opt_type; }
00606 
00611         virtual inline void set_optimization_type(EOptimizationType t) { opt_type=t;}
00612 
00617         inline bool get_is_initialized() { return optimization_initialized; }
00618 
00626         virtual bool init_optimization(
00627             int32_t count, int32_t *IDX, float64_t *weights);
00628 
00633         virtual bool delete_optimization();
00634 
00640         bool init_optimization_svm(CSVM * svm) ;
00641 
00647         virtual float64_t compute_optimized(int32_t vector_idx);
00648 
00657         virtual void compute_batch(
00658             int32_t num_vec, int32_t* vec_idx, float64_t* target,
00659             int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00660             float64_t factor=1.0);
00661 
00666         inline float64_t get_combined_kernel_weight() { return combined_kernel_weight; }
00667 
00672         inline void set_combined_kernel_weight(float64_t nw) { combined_kernel_weight=nw; }
00673 
00678         virtual int32_t get_num_subkernels();
00679 
00685         virtual void compute_by_subkernel(
00686             int32_t vector_idx, float64_t * subkernel_contrib);
00687 
00693         virtual const float64_t* get_subkernel_weights(int32_t& num_weights);
00694 
00700         virtual void set_subkernel_weights(
00701             float64_t* weights, int32_t num_weights);
00702 
00703     protected:
00708         inline void set_property(EKernelProperty p)
00709         {
00710             properties |= p;
00711         }
00712 
00717         inline void unset_property(EKernelProperty p)
00718         {
00719             properties &= (properties | p) ^ p;
00720         }
00721 
00726         inline void set_is_initialized(bool p_init) { optimization_initialized=p_init; }
00727 
00738         virtual float64_t compute(int32_t x, int32_t y)=0;
00739 
00746         int32_t compute_row_start(int64_t offs, int32_t n, bool symmetric)
00747         {
00748             int32_t i_start;
00749 
00750             if (symmetric)
00751                 i_start=(int32_t) CMath::floor(n-CMath::sqrt(CMath::sq((float64_t) n)-offs));
00752             else
00753                 i_start=(int32_t) (offs/int64_t(n));
00754 
00755             return i_start;
00756         }
00757 
00762         template <class T>
00763         static void* get_kernel_matrix_helper(void* p)
00764         {
00765             K_THREAD_PARAM<T>* params= (K_THREAD_PARAM<T>*) p;
00766             int32_t i_start=params->start;
00767             int32_t i_end=params->end;
00768             CKernel* k=params->kernel;
00769             T* result=params->result;
00770             bool symmetric=params->symmetric;
00771             int32_t n=params->n;
00772             int32_t m=params->m;
00773             bool verbose=params->verbose;
00774             int64_t total_start=params->total_start;
00775             int64_t total_end=params->total_end;
00776             int64_t total=total_start;
00777 
00778             for (int32_t i=i_start; i<i_end; i++)
00779             {
00780                 int32_t j_start=0;
00781 
00782                 if (symmetric)
00783                     j_start=i;
00784 
00785                 for (int32_t j=j_start; j<n; j++)
00786                 {
00787                     float64_t v=k->kernel(i,j);
00788                     result[i+j*m]=v;
00789 
00790                     if (symmetric && i!=j)
00791                         result[j+i*m]=v;
00792 
00793                     if (verbose)
00794                     {
00795                         total++;
00796 
00797                         if (symmetric && i!=j)
00798                             total++;
00799 
00800                         if (total%100 == 0)
00801                             k->SG_PROGRESS(total, total_start, total_end);
00802 
00803                         if (CSignal::cancel_computations())
00804                             break;
00805                     }
00806                 }
00807 
00808             }
00809 
00810             return NULL;
00811         }
00812 
00813 #ifdef USE_SVMLIGHT
00814 
00815 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00816         struct KERNEL_CACHE {
00818             int32_t   *index;
00820             int32_t   *invindex;
00822             int32_t   *active2totdoc;
00824             int32_t   *totdoc2active;
00826             int32_t   *lru;
00828             int32_t   *occu;
00830             int32_t   elems;
00832             int32_t   max_elems;
00834             int32_t   time;
00836             int32_t   activenum;
00837 
00839             KERNELCACHE_ELEM  *buffer;
00841             KERNELCACHE_IDX   buffsize;
00842         };
00843 
00845         struct S_KTHREAD_PARAM
00846         {
00848             CKernel* kernel;
00850             KERNEL_CACHE* kernel_cache;
00852             KERNELCACHE_ELEM** cache;
00854             int32_t* uncached_rows;
00856             int32_t num_uncached;
00858             uint8_t* needs_computation;
00860             int32_t start;
00862             int32_t end;
00864             int32_t num_vectors;
00865         };
00866 #endif // DOXYGEN_SHOULD_SKIP_THIS
00867 
00869         static void* cache_multiple_kernel_row_helper(void* p);
00870 
00872         void   kernel_cache_free(int32_t cacheidx);
00873         int32_t   kernel_cache_malloc();
00874         int32_t   kernel_cache_free_lru();
00875         KERNELCACHE_ELEM *kernel_cache_clean_and_malloc(int32_t cacheidx);
00876 #endif //USE_SVMLIGHT
00877 
00878 
00879 
00880 #ifdef HAVE_BOOST_SERIALIZATION
00881     private:
00882 
00883         friend class ::boost::serialization::access;
00884         template<class Archive>
00885             void serialize(Archive & ar, const unsigned int archive_version)
00886             {
00887 
00888                 SG_DEBUG("archiving CKernel\n");
00889 
00890                 ar & ::boost::serialization::base_object<CSGObject>(*this);
00891 
00892                 ar & cache_size;
00893 
00894 #ifdef USE_SVMLIGHT
00895                 //TODO
00896                 //KERNEL_CACHE kernel_cache;
00897 #endif //USE_SVMLIGHT
00898 
00899                 //TODO
00900                 //KERNELCACHE_ELEM* kernel_matrix;
00901 
00902                 //TODO
00903                 //SHORTREAL * precomputed_matrix ;
00904                 //ar & precompute_subkernel_matrix ;
00905                 //ar & precompute_matrix ;
00906 
00907                 ar & rhs;
00908                 ar & lhs;
00909 
00910                 ar & combined_kernel_weight;
00911 
00912                 ar & optimization_initialized;
00913 
00914                 ar & opt_type;
00915 
00916                 ar & properties;
00917 
00918                 SG_DEBUG("done with CKernel\n");
00919 
00920             }
00921 
00922 #endif //HAVE_BOOST_SERIALIZATION
00923 
00924 
00925 
00926     protected:
00928         int32_t cache_size;
00929 
00930 #ifdef USE_SVMLIGHT
00931 
00932         KERNEL_CACHE kernel_cache;
00933 #endif //USE_SVMLIGHT
00934 
00937         KERNELCACHE_ELEM* kernel_matrix;
00938 
00940         CFeatures* lhs;
00942         CFeatures* rhs;
00943 
00945         int32_t num_lhs;
00947         int32_t num_rhs;
00948 
00950         float64_t combined_kernel_weight;
00951 
00953         bool optimization_initialized;
00957         EOptimizationType opt_type;
00958 
00960         uint64_t  properties;
00961 
00964         CKernelNormalizer* normalizer;
00965 };
00966 
00967 }
00968 #endif /* _KERNEL_H__ */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation