|
SHOGUN v0.9.3
|
00001 /* 00002 SVM with stochastic gradient 00003 Copyright (C) 2007- Leon Bottou 00004 00005 This program is free software; you can redistribute it and/or 00006 modify it under the terms of the GNU Lesser General Public 00007 License as published by the Free Software Foundation; either 00008 version 2.1 of the License, or (at your option) any later version. 00009 00010 This program is distributed in the hope that it will be useful, 00011 but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 GNU General Public License for more details. 00014 00015 You should have received a copy of the GNU General Public License 00016 along with this program; if not, write to the Free Software 00017 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA 00018 $Id: svmsgd.cpp,v 1.13 2007/10/02 20:40:06 cvs Exp $ 00019 00020 Shogun adjustments (w) 2008-2009 Soeren Sonnenburg 00021 */ 00022 00023 #include "classifier/svm/SVMSGD.h" 00024 #include "lib/Signal.h" 00025 00026 using namespace shogun; 00027 00028 // Available losses 00029 #define HINGELOSS 1 00030 #define SMOOTHHINGELOSS 2 00031 #define SQUAREDHINGELOSS 3 00032 #define LOGLOSS 10 00033 #define LOGLOSSMARGIN 11 00034 00035 // Select loss 00036 #define LOSS HINGELOSS 00037 00038 // One when bias is regularized 00039 #define REGULARIZEBIAS 0 00040 00041 inline 00042 float64_t loss(float64_t z) 00043 { 00044 #if LOSS == LOGLOSS 00045 if (z >= 0) 00046 return log(1+exp(-z)); 00047 else 00048 return -z + log(1+exp(z)); 00049 #elif LOSS == LOGLOSSMARGIN 00050 if (z >= 1) 00051 return log(1+exp(1-z)); 00052 else 00053 return 1-z + log(1+exp(z-1)); 00054 #elif LOSS == SMOOTHHINGELOSS 00055 if (z < 0) 00056 return 0.5 - z; 00057 if (z < 1) 00058 return 0.5 * (1-z) * (1-z); 00059 return 0; 00060 #elif LOSS == SQUAREDHINGELOSS 00061 if (z < 1) 00062 return 0.5 * (1 - z) * (1 - z); 00063 return 0; 00064 #elif LOSS == HINGELOSS 00065 if (z < 1) 00066 return 1 - z; 00067 return 0; 00068 #else 00069 # error "Undefined loss" 00070 #endif 00071 } 00072 00073 inline 00074 float64_t dloss(float64_t z) 00075 { 00076 #if LOSS == LOGLOSS 00077 if (z < 0) 00078 return 1 / (exp(z) + 1); 00079 float64_t ez = exp(-z); 00080 return ez / (ez + 1); 00081 #elif LOSS == LOGLOSSMARGIN 00082 if (z < 1) 00083 return 1 / (exp(z-1) + 1); 00084 float64_t ez = exp(1-z); 00085 return ez / (ez + 1); 00086 #elif LOSS == SMOOTHHINGELOSS 00087 if (z < 0) 00088 return 1; 00089 if (z < 1) 00090 return 1-z; 00091 return 0; 00092 #elif LOSS == SQUAREDHINGELOSS 00093 if (z < 1) 00094 return (1 - z); 00095 return 0; 00096 #else 00097 if (z < 1) 00098 return 1; 00099 return 0; 00100 #endif 00101 } 00102 00103 00104 00105 CSVMSGD::CSVMSGD(float64_t C) 00106 : CLinearClassifier(), t(1), C1(C), C2(C), 00107 wscale(1), bscale(1), epochs(5), skip(1000), count(1000), use_bias(true), 00108 use_regularized_bias(false) 00109 { 00110 } 00111 00112 CSVMSGD::CSVMSGD(float64_t C, CDotFeatures* traindat, CLabels* trainlab) 00113 : CLinearClassifier(), t(1), C1(C), C2(C), wscale(1), bscale(1), 00114 epochs(5), skip(1000), count(1000), use_bias(true), 00115 use_regularized_bias(false) 00116 { 00117 w=NULL; 00118 set_features(traindat); 00119 set_labels(trainlab); 00120 } 00121 00122 CSVMSGD::~CSVMSGD() 00123 { 00124 delete[] w; 00125 w=NULL; 00126 } 00127 00128 bool CSVMSGD::train(CFeatures* data) 00129 { 00130 // allocate memory for w and initialize everyting w and bias with 0 00131 ASSERT(labels); 00132 00133 if (data) 00134 { 00135 if (!data->has_property(FP_DOT)) 00136 SG_ERROR("Specified features are not of type CDotFeatures\n"); 00137 set_features((CDotFeatures*) data); 00138 } 00139 00140 ASSERT(features); 00141 ASSERT(labels->is_two_class_labeling()); 00142 00143 int32_t num_train_labels=labels->get_num_labels(); 00144 w_dim=features->get_dim_feature_space(); 00145 int32_t num_vec=features->get_num_vectors(); 00146 00147 ASSERT(num_vec==num_train_labels); 00148 ASSERT(num_vec>0); 00149 00150 delete[] w; 00151 w=new float64_t[w_dim]; 00152 memset(w, 0, w_dim*sizeof(float64_t)); 00153 bias=0; 00154 00155 float64_t lambda= 1.0/(C1*num_vec); 00156 00157 // Shift t in order to have a 00158 // reasonable initial learning rate. 00159 // This assumes |x| \approx 1. 00160 float64_t maxw = 1.0 / sqrt(lambda); 00161 float64_t typw = sqrt(maxw); 00162 float64_t eta0 = typw / CMath::max(1.0,dloss(-typw)); 00163 t = 1 / (eta0 * lambda); 00164 00165 SG_INFO("lambda=%f, epochs=%d, eta0=%f\n", lambda, epochs, eta0); 00166 00167 00168 //do the sgd 00169 calibrate(); 00170 00171 SG_INFO("Training on %d vectors\n", num_vec); 00172 CSignal::clear_cancel(); 00173 00174 for(int32_t e=0; e<epochs && (!CSignal::cancel_computations()); e++) 00175 { 00176 count = skip; 00177 for (int32_t i=0; i<num_vec; i++) 00178 { 00179 float64_t eta = 1.0 / (lambda * t); 00180 float64_t y = labels->get_label(i); 00181 float64_t z = y * (features->dense_dot(i, w, w_dim) + bias); 00182 00183 #if LOSS < LOGLOSS 00184 if (z < 1) 00185 #endif 00186 { 00187 float64_t etd = eta * dloss(z); 00188 features->add_to_dense_vec(etd * y / wscale, i, w, w_dim); 00189 00190 if (use_bias) 00191 { 00192 if (use_regularized_bias) 00193 bias *= 1 - eta * lambda * bscale; 00194 bias += etd * y * bscale; 00195 } 00196 } 00197 00198 if (--count <= 0) 00199 { 00200 float64_t r = 1 - eta * lambda * skip; 00201 if (r < 0.8) 00202 r = pow(1 - eta * lambda, skip); 00203 CMath::scale_vector(r, w, w_dim); 00204 count = skip; 00205 } 00206 t++; 00207 } 00208 } 00209 00210 float64_t wnorm = CMath::dot(w,w, w_dim); 00211 SG_INFO("Norm: %.6f, Bias: %.6f\n", wnorm, bias); 00212 00213 return true; 00214 } 00215 00216 void CSVMSGD::calibrate() 00217 { 00218 ASSERT(features); 00219 int32_t num_vec=features->get_num_vectors(); 00220 int32_t c_dim=features->get_dim_feature_space(); 00221 00222 ASSERT(num_vec>0); 00223 ASSERT(c_dim>0); 00224 00225 float64_t* c=new float64_t[c_dim]; 00226 memset(c, 0, c_dim*sizeof(float64_t)); 00227 00228 SG_INFO("Estimating sparsity and bscale num_vec=%d num_feat=%d.\n", num_vec, c_dim); 00229 00230 // compute average gradient size 00231 int32_t n = 0; 00232 float64_t m = 0; 00233 float64_t r = 0; 00234 00235 for (int32_t j=0; j<num_vec && m<=1000; j++, n++) 00236 { 00237 r += features->get_nnz_features_for_vector(j); 00238 features->add_to_dense_vec(1, j, c, c_dim, true); 00239 00240 //waste cpu cycles for readability 00241 //(only changed dims need checking) 00242 m=CMath::max(c, c_dim); 00243 } 00244 00245 // bias update scaling 00246 bscale = m/n; 00247 00248 // compute weight decay skip 00249 skip = (int32_t) ((16 * n * c_dim) / r); 00250 SG_INFO("using %d examples. skip=%d bscale=%.6f\n", n, skip, bscale); 00251 00252 delete[] c; 00253 } 00254