|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Berlin Institute of Technology 00009 */ 00010 00011 #include "lib/common.h" 00012 #include "lib/io.h" 00013 #include "kernel/SNPStringKernel.h" 00014 #include "kernel/SqrtDiagKernelNormalizer.h" 00015 #include "features/Features.h" 00016 #include "features/StringFeatures.h" 00017 00018 using namespace shogun; 00019 00020 CSNPStringKernel::CSNPStringKernel(int32_t size, 00021 int32_t degree, int32_t win_len, bool inhomogene) 00022 : CStringKernel<char>(size), 00023 m_degree(degree), m_win_len(2*win_len), m_inhomogene(inhomogene) 00024 { 00025 m_str_min=NULL; 00026 m_str_maj=NULL; 00027 set_normalizer(new CSqrtDiagKernelNormalizer()); 00028 } 00029 00030 CSNPStringKernel::CSNPStringKernel( 00031 CStringFeatures<char>* l, CStringFeatures<char>* r, 00032 int32_t degree, int32_t win_len, bool inhomogene) 00033 : CStringKernel<char>(10), m_degree(degree), m_win_len(2*win_len), 00034 m_inhomogene(inhomogene) 00035 { 00036 m_str_min=NULL; 00037 m_str_maj=NULL; 00038 set_normalizer(new CSqrtDiagKernelNormalizer()); 00039 if (l==r) 00040 obtain_base_strings(); 00041 init(l, r); 00042 } 00043 00044 CSNPStringKernel::~CSNPStringKernel() 00045 { 00046 cleanup(); 00047 } 00048 00049 bool CSNPStringKernel::init(CFeatures* l, CFeatures* r) 00050 { 00051 CStringKernel<char>::init(l, r); 00052 return init_normalizer(); 00053 } 00054 00055 void CSNPStringKernel::cleanup() 00056 { 00057 CKernel::cleanup(); 00058 free(m_str_min); 00059 free(m_str_maj); 00060 } 00061 00062 void CSNPStringKernel::obtain_base_strings() 00063 { 00064 //should only be called on training data 00065 ASSERT(lhs==rhs); 00066 00067 m_str_len=0; 00068 00069 for (int32_t i=0; i<num_lhs; i++) 00070 { 00071 int32_t len; 00072 bool free_vec; 00073 char* vec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, len, free_vec); 00074 00075 if (m_str_len==0) 00076 { 00077 m_str_len=len; 00078 size_t tlen=(len+1)*sizeof(char); 00079 m_str_min=(char*) malloc(tlen); 00080 m_str_maj=(char*) malloc(tlen); 00081 memset(m_str_min, 0, tlen); 00082 memset(m_str_maj, 0, tlen); 00083 } 00084 else 00085 { 00086 ASSERT(m_str_len==len); 00087 } 00088 00089 for (int32_t j=0; j<len; j++) 00090 { 00091 // skip sequencing errors 00092 if (vec[j]=='0') 00093 continue; 00094 00095 if (m_str_min[j]==0) 00096 m_str_min[j]=vec[j]; 00097 else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j]) 00098 m_str_maj[j]=vec[j]; 00099 } 00100 00101 ((CStringFeatures<char>*) lhs)->free_feature_vector(vec, i, free_vec); 00102 } 00103 00104 for (int32_t j=0; j<m_str_len; j++) 00105 { 00106 // if only one one symbol occurs use 0 00107 if (m_str_min[j]==0) 00108 m_str_min[j]='0'; 00109 if (m_str_maj[j]==0) 00110 m_str_maj[j]='0'; 00111 00112 if (m_str_min[j]>m_str_maj[j]) 00113 CMath::swap(m_str_min[j], m_str_maj[j]); 00114 } 00115 } 00116 00117 float64_t CSNPStringKernel::compute(int32_t idx_a, int32_t idx_b) 00118 { 00119 int32_t alen, blen; 00120 bool free_avec, free_bvec; 00121 00122 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00123 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00124 00125 ASSERT(alen==blen); 00126 if (alen!=m_str_len) 00127 SG_ERROR("alen (%d) !=m_str_len (%d)\n", alen, m_str_len); 00128 ASSERT(m_str_min); 00129 ASSERT(m_str_maj); 00130 00131 float64_t total=0; 00132 int32_t inhomogene= (m_inhomogene) ? 1 : 0; 00133 00134 for (int32_t i = 0; i<alen-1; i+=2) 00135 { 00136 int32_t sumaa=0; 00137 int32_t sumbb=0; 00138 int32_t sumab=0; 00139 00140 for (int32_t l=0; l<m_win_len && i+l<alen-1; l+=2) 00141 { 00142 char a1=avec[i+l]; 00143 char a2=avec[i+l+1]; 00144 char b1=bvec[i+l]; 00145 char b2=bvec[i+l+1]; 00146 00147 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0')) 00148 sumab++; 00149 else if (a1==a2 && b1==b2) 00150 { 00151 if (a1!=b1) 00152 continue; 00153 00154 if (a1==m_str_min[i+l]) 00155 sumaa++; 00156 else if (a1==m_str_maj[i+l]) 00157 sumbb++; 00158 else 00159 { 00160 SG_ERROR("The impossible happened i=%d l=%d a1=%c " 00161 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, l, a1,a2, b1,b2, m_str_min[i+l], m_str_maj[i+l]); 00162 } 00163 } 00164 00165 } 00166 total+=CMath::pow(float64_t(sumaa+sumbb+sumab+inhomogene), 00167 (int32_t) m_degree); 00168 } 00169 00170 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00171 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00172 return total; 00173 }