|
SHOGUN v0.9.3
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 00011 #include "lib/File.h" 00012 #include "features/SparseFeatures.h" 00013 #include "lib/BinaryFile.h" 00014 00015 using namespace shogun; 00016 00017 CBinaryFile::CBinaryFile(FILE* f, const char* name) : CFile(f, name) 00018 { 00019 } 00020 00021 CBinaryFile::CBinaryFile(char* fname, char rw, const char* name) : CFile(fname, rw, name) 00022 { 00023 } 00024 00025 CBinaryFile::~CBinaryFile() 00026 { 00027 } 00028 00029 #define GET_VECTOR(fname, sg_type, datatype) \ 00030 void CBinaryFile::fname(sg_type*& vec, int32_t& len) \ 00031 { \ 00032 if (!file) \ 00033 SG_ERROR("File invalid.\n"); \ 00034 SGDataType dtype=read_header(); \ 00035 if (dtype!=datatype) \ 00036 SG_ERROR("Datatype mismatch\n"); \ 00037 \ 00038 if (fread(&len, sizeof(int32_t), 1, file)!=1) \ 00039 SG_ERROR("Failed to read vector length\n"); \ 00040 vec=new sg_type[len]; \ 00041 if (fread(vec, sizeof(sg_type), len, file)!=(size_t) len) \ 00042 SG_ERROR("Failed to read Matrix\n"); \ 00043 } 00044 00045 GET_VECTOR(get_byte_vector, uint8_t, DT_VECTOR_BYTE) 00046 GET_VECTOR(get_char_vector, char, DT_VECTOR_CHAR) 00047 GET_VECTOR(get_int_vector, int32_t, DT_VECTOR_INT) 00048 GET_VECTOR(get_shortreal_vector, float32_t, DT_VECTOR_SHORTREAL) 00049 GET_VECTOR(get_real_vector, float64_t, DT_VECTOR_REAL) 00050 GET_VECTOR(get_short_vector, int16_t, DT_VECTOR_SHORT) 00051 GET_VECTOR(get_word_vector, uint16_t, DT_VECTOR_WORD) 00052 #undef GET_VECTOR 00053 00054 #define GET_MATRIX(fname, sg_type, datatype) \ 00055 void CBinaryFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00056 { \ 00057 if (!file) \ 00058 SG_ERROR("File invalid.\n"); \ 00059 SGDataType dtype=read_header(); \ 00060 if (dtype!=datatype) \ 00061 SG_ERROR("Datatype mismatch\n"); \ 00062 \ 00063 if (fread(&num_feat, sizeof(int32_t), 1, file)!=1 || \ 00064 fread(&num_vec, sizeof(int32_t), 1, file)!=1) \ 00065 SG_ERROR("Failed to read Matrix dimensions\n"); \ 00066 matrix=new sg_type[int64_t(num_feat)*num_vec]; \ 00067 if (fread(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec) \ 00068 SG_ERROR("Failed to read Matrix\n"); \ 00069 } 00070 00071 GET_MATRIX(get_char_matrix, char, DT_DENSE_CHAR) 00072 GET_MATRIX(get_byte_matrix, uint8_t, DT_DENSE_BYTE) 00073 GET_MATRIX(get_int_matrix, int32_t, DT_DENSE_INT) 00074 GET_MATRIX(get_uint_matrix, uint32_t, DT_DENSE_UINT) 00075 GET_MATRIX(get_long_matrix, int64_t, DT_DENSE_LONG) 00076 GET_MATRIX(get_ulong_matrix, uint64_t, DT_DENSE_ULONG) 00077 GET_MATRIX(get_short_matrix, int16_t, DT_DENSE_SHORT) 00078 GET_MATRIX(get_word_matrix, uint16_t, DT_DENSE_WORD) 00079 GET_MATRIX(get_shortreal_matrix, float32_t, DT_DENSE_SHORTREAL) 00080 GET_MATRIX(get_real_matrix, float64_t, DT_DENSE_REAL) 00081 GET_MATRIX(get_longreal_matrix, floatmax_t, DT_DENSE_LONGREAL) 00082 #undef GET_MATRIX 00083 00084 void CBinaryFile::get_byte_ndarray(uint8_t*& array, int32_t*& dims, int32_t& num_dims) 00085 { 00086 } 00087 00088 void CBinaryFile::get_char_ndarray(char*& array, int32_t*& dims, int32_t& num_dims) 00089 { 00090 } 00091 00092 void CBinaryFile::get_int_ndarray(int32_t*& array, int32_t*& dims, int32_t& num_dims) 00093 { 00094 } 00095 00096 void CBinaryFile::get_shortreal_ndarray(float32_t*& array, int32_t*& dims, int32_t& num_dims) 00097 { 00098 } 00099 00100 void CBinaryFile::get_real_ndarray(float64_t*& array, int32_t*& dims, int32_t& num_dims) 00101 { 00102 } 00103 00104 void CBinaryFile::get_short_ndarray(int16_t*& array, int32_t*& dims, int32_t& num_dims) 00105 { 00106 } 00107 00108 void CBinaryFile::get_word_ndarray(uint16_t*& array, int32_t*& dims, int32_t& num_dims) 00109 { 00110 } 00111 00112 #define GET_SPARSEMATRIX(fname, sg_type, datatype) \ 00113 void CBinaryFile::fname(TSparse<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00114 { \ 00115 if (!(file)) \ 00116 SG_ERROR("File invalid.\n"); \ 00117 \ 00118 SGDataType dtype=read_header(); \ 00119 if (dtype!=datatype) \ 00120 SG_ERROR("Datatype mismatch\n"); \ 00121 \ 00122 if (fread(&num_vec, sizeof(int32_t), 1, file)!=1) \ 00123 SG_ERROR("Failed to read number of vectors\n"); \ 00124 \ 00125 matrix=new TSparse<sg_type>[num_vec]; \ 00126 \ 00127 for (int32_t i=0; i<num_vec; i++) \ 00128 { \ 00129 int32_t len=0; \ 00130 if (fread(&len, sizeof(int32_t), 1, file)!=1) \ 00131 SG_ERROR("Failed to read sparse vector length of vector idx=%d\n", i); \ 00132 matrix[i].num_feat_entries=len; \ 00133 TSparseEntry<sg_type>* vec = new TSparseEntry<sg_type>[len]; \ 00134 if (fread(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len) \ 00135 SG_ERROR("Failed to read sparse vector %d\n", i); \ 00136 matrix[i].features=vec; \ 00137 } \ 00138 } 00139 GET_SPARSEMATRIX(get_bool_sparsematrix, bool, DT_SPARSE_BOOL) 00140 GET_SPARSEMATRIX(get_char_sparsematrix, char, DT_SPARSE_CHAR) 00141 GET_SPARSEMATRIX(get_byte_sparsematrix, uint8_t, DT_SPARSE_BYTE) 00142 GET_SPARSEMATRIX(get_int_sparsematrix, int32_t, DT_SPARSE_INT) 00143 GET_SPARSEMATRIX(get_uint_sparsematrix, uint32_t, DT_SPARSE_UINT) 00144 GET_SPARSEMATRIX(get_long_sparsematrix, int64_t, DT_SPARSE_LONG) 00145 GET_SPARSEMATRIX(get_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG) 00146 GET_SPARSEMATRIX(get_short_sparsematrix, int16_t, DT_SPARSE_SHORT) 00147 GET_SPARSEMATRIX(get_word_sparsematrix, uint16_t, DT_SPARSE_WORD) 00148 GET_SPARSEMATRIX(get_shortreal_sparsematrix, float32_t, DT_SPARSE_SHORTREAL) 00149 GET_SPARSEMATRIX(get_real_sparsematrix, float64_t, DT_SPARSE_REAL) 00150 GET_SPARSEMATRIX(get_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL) 00151 #undef GET_SPARSEMATRIX 00152 00153 00154 #define GET_STRING_LIST(fname, sg_type, datatype) \ 00155 void CBinaryFile::fname(T_STRING<sg_type>*& strings, int32_t& num_str, int32_t& max_string_len) \ 00156 { \ 00157 strings=NULL; \ 00158 num_str=0; \ 00159 max_string_len=0; \ 00160 \ 00161 if (!file) \ 00162 SG_ERROR("File invalid.\n"); \ 00163 \ 00164 SGDataType dtype=read_header(); \ 00165 if (dtype!=datatype) \ 00166 SG_ERROR("Datatype mismatch\n"); \ 00167 \ 00168 if (fread(&num_str, sizeof(int32_t), 1, file)!=1) \ 00169 SG_ERROR("Failed to read number of strings\n"); \ 00170 \ 00171 strings=new T_STRING<sg_type>[num_str]; \ 00172 \ 00173 for (int32_t i=0; i<num_str; i++) \ 00174 { \ 00175 int32_t len=0; \ 00176 if (fread(&len, sizeof(int32_t), 1, file)!=1) \ 00177 SG_ERROR("Failed to read string length of string with idx=%d\n", i); \ 00178 strings[i].length=len; \ 00179 sg_type* str = new sg_type[len]; \ 00180 if (fread(str, sizeof(sg_type), len, file)!= (size_t) len) \ 00181 SG_ERROR("Failed to read string %d\n", i); \ 00182 strings[i].string=str; \ 00183 } \ 00184 } 00185 00186 GET_STRING_LIST(get_char_string_list, char, DT_STRING_CHAR) 00187 GET_STRING_LIST(get_byte_string_list, uint8_t, DT_STRING_BYTE) 00188 GET_STRING_LIST(get_int_string_list, int32_t, DT_STRING_INT) 00189 GET_STRING_LIST(get_uint_string_list, uint32_t, DT_STRING_UINT) 00190 GET_STRING_LIST(get_long_string_list, int64_t, DT_STRING_LONG) 00191 GET_STRING_LIST(get_ulong_string_list, uint64_t, DT_STRING_ULONG) 00192 GET_STRING_LIST(get_short_string_list, int16_t, DT_STRING_SHORT) 00193 GET_STRING_LIST(get_word_string_list, uint16_t, DT_STRING_WORD) 00194 GET_STRING_LIST(get_shortreal_string_list, float32_t, DT_STRING_SHORTREAL) 00195 GET_STRING_LIST(get_real_string_list, float64_t, DT_STRING_REAL) 00196 GET_STRING_LIST(get_longreal_string_list, floatmax_t, DT_STRING_LONGREAL) 00197 #undef GET_STRING_LIST 00198 00201 #define SET_VECTOR(fname, sg_type, dtype) \ 00202 void CBinaryFile::fname(const sg_type* vec, int32_t len) \ 00203 { \ 00204 if (!(file && vec)) \ 00205 SG_ERROR("File or vector invalid.\n"); \ 00206 \ 00207 write_header(dtype); \ 00208 \ 00209 if (fwrite(&len, sizeof(int32_t), 1, file)!=1 || \ 00210 fwrite(vec, sizeof(sg_type), len, file)!=(size_t) len) \ 00211 SG_ERROR("Failed to write vector\n"); \ 00212 } 00213 SET_VECTOR(set_byte_vector, uint8_t, DT_VECTOR_BYTE) 00214 SET_VECTOR(set_char_vector, char, DT_VECTOR_CHAR) 00215 SET_VECTOR(set_int_vector, int32_t, DT_VECTOR_INT) 00216 SET_VECTOR(set_shortreal_vector, float32_t, DT_VECTOR_SHORTREAL) 00217 SET_VECTOR(set_real_vector, float64_t, DT_VECTOR_REAL) 00218 SET_VECTOR(set_short_vector, int16_t, DT_VECTOR_SHORT) 00219 SET_VECTOR(set_word_vector, uint16_t, DT_VECTOR_WORD) 00220 #undef SET_VECTOR 00221 00222 #define SET_MATRIX(fname, sg_type, dtype) \ 00223 void CBinaryFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \ 00224 { \ 00225 if (!(file && matrix)) \ 00226 SG_ERROR("File or matrix invalid.\n"); \ 00227 \ 00228 write_header(dtype); \ 00229 \ 00230 if (fwrite(&num_feat, sizeof(int32_t), 1, file)!=1 || \ 00231 fwrite(&num_vec, sizeof(int32_t), 1, file)!=1 || \ 00232 fwrite(matrix, sizeof(sg_type)*num_feat, num_vec, file)!=(size_t) num_vec) \ 00233 SG_ERROR("Failed to write Matrix\n"); \ 00234 } 00235 SET_MATRIX(set_char_matrix, char, DT_DENSE_CHAR) 00236 SET_MATRIX(set_byte_matrix, uint8_t, DT_DENSE_BYTE) 00237 SET_MATRIX(set_int_matrix, int32_t, DT_DENSE_INT) 00238 SET_MATRIX(set_uint_matrix, uint32_t, DT_DENSE_UINT) 00239 SET_MATRIX(set_long_matrix, int64_t, DT_DENSE_LONG) 00240 SET_MATRIX(set_ulong_matrix, uint64_t, DT_DENSE_ULONG) 00241 SET_MATRIX(set_short_matrix, int16_t, DT_DENSE_SHORT) 00242 SET_MATRIX(set_word_matrix, uint16_t, DT_DENSE_WORD) 00243 SET_MATRIX(set_shortreal_matrix, float32_t, DT_DENSE_SHORTREAL) 00244 SET_MATRIX(set_real_matrix, float64_t, DT_DENSE_REAL) 00245 SET_MATRIX(set_longreal_matrix, floatmax_t, DT_DENSE_LONGREAL) 00246 #undef SET_MATRIX 00247 00248 #define SET_SPARSEMATRIX(fname, sg_type, dtype) \ 00249 void CBinaryFile::fname(const TSparse<sg_type>* matrix, \ 00250 int32_t num_feat, int32_t num_vec) \ 00251 { \ 00252 if (!(file && matrix)) \ 00253 SG_ERROR("File or matrix invalid.\n"); \ 00254 \ 00255 write_header(dtype); \ 00256 \ 00257 if (fwrite(&num_vec, sizeof(int32_t), 1, file)!=1) \ 00258 SG_ERROR("Failed to write Sparse Matrix\n"); \ 00259 \ 00260 for (int32_t i=0; i<num_vec; i++) \ 00261 { \ 00262 TSparseEntry<sg_type>* vec = matrix[i].features; \ 00263 int32_t len=matrix[i].num_feat_entries; \ 00264 if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) || \ 00265 (fwrite(vec, sizeof(TSparseEntry<sg_type>), len, file)!= (size_t) len)) \ 00266 SG_ERROR("Failed to write Sparse Matrix\n"); \ 00267 } \ 00268 } 00269 SET_SPARSEMATRIX(set_bool_sparsematrix, bool, DT_SPARSE_BOOL) 00270 SET_SPARSEMATRIX(set_char_sparsematrix, char, DT_SPARSE_CHAR) 00271 SET_SPARSEMATRIX(set_byte_sparsematrix, uint8_t, DT_SPARSE_BYTE) 00272 SET_SPARSEMATRIX(set_int_sparsematrix, int32_t, DT_SPARSE_INT) 00273 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, DT_SPARSE_UINT) 00274 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, DT_SPARSE_LONG) 00275 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, DT_SPARSE_ULONG) 00276 SET_SPARSEMATRIX(set_short_sparsematrix, int16_t, DT_SPARSE_SHORT) 00277 SET_SPARSEMATRIX(set_word_sparsematrix, uint16_t, DT_SPARSE_WORD) 00278 SET_SPARSEMATRIX(set_shortreal_sparsematrix, float32_t, DT_SPARSE_SHORTREAL) 00279 SET_SPARSEMATRIX(set_real_sparsematrix, float64_t, DT_SPARSE_REAL) 00280 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, DT_SPARSE_LONGREAL) 00281 #undef SET_SPARSEMATRIX 00282 00283 #define SET_STRING_LIST(fname, sg_type, dtype) \ 00284 void CBinaryFile::fname(const T_STRING<sg_type>* strings, int32_t num_str) \ 00285 { \ 00286 if (!(file && strings)) \ 00287 SG_ERROR("File or strings invalid.\n"); \ 00288 \ 00289 write_header(dtype); \ 00290 for (int32_t i=0; i<num_str; i++) \ 00291 { \ 00292 int32_t len = strings[i].length; \ 00293 if ((fwrite(&len, sizeof(int32_t), 1, file)!=1) || \ 00294 (fwrite(strings[i].string, sizeof(sg_type), len, file)!= (size_t) len)) \ 00295 SG_ERROR("Failed to write Sparse Matrix\n"); \ 00296 } \ 00297 } 00298 SET_STRING_LIST(set_char_string_list, char, DT_STRING_CHAR) 00299 SET_STRING_LIST(set_byte_string_list, uint8_t, DT_STRING_BYTE) 00300 SET_STRING_LIST(set_int_string_list, int32_t, DT_STRING_INT) 00301 SET_STRING_LIST(set_uint_string_list, uint32_t, DT_STRING_UINT) 00302 SET_STRING_LIST(set_long_string_list, int64_t, DT_STRING_LONG) 00303 SET_STRING_LIST(set_ulong_string_list, uint64_t, DT_STRING_ULONG) 00304 SET_STRING_LIST(set_short_string_list, int16_t, DT_STRING_SHORT) 00305 SET_STRING_LIST(set_word_string_list, uint16_t, DT_STRING_WORD) 00306 SET_STRING_LIST(set_shortreal_string_list, float32_t, DT_STRING_SHORTREAL) 00307 SET_STRING_LIST(set_real_string_list, float64_t, DT_STRING_REAL) 00308 SET_STRING_LIST(set_longreal_string_list, floatmax_t, DT_STRING_LONGREAL) 00309 #undef SET_STRING_LIST 00310 00311 00312 int32_t CBinaryFile::parse_first_header(SGDataType &type) 00313 { 00314 return -1; 00315 } 00316 00317 int32_t CBinaryFile::parse_next_header(SGDataType &type) 00318 { 00319 return -1; 00320 } 00321 00322 00323 SGDataType CBinaryFile::read_header() 00324 { 00325 ASSERT(file); 00326 00327 char fourcc[4]; 00328 uint16_t endian=0; 00329 uint16_t dtype=0; 00330 00331 if (!((fread(&fourcc, sizeof(char), 4, file)==4) && 00332 (fread(&endian, sizeof(uint16_t), 1, file)== 1) && 00333 (fread(&dtype, sizeof(uint16_t), 1, file)== 1))) 00334 SG_ERROR("Error reading header\n"); 00335 00336 if (strncmp(fourcc, "SG00", 4)) 00337 SG_ERROR("Header mismatch, expected SG00\n"); 00338 00339 return (SGDataType) dtype; 00340 } 00341 00342 void CBinaryFile::write_header(SGDataType datatype) 00343 { 00344 ASSERT(file); 00345 00346 const char* fourcc="SG00"; 00347 uint16_t endian=0x1234; 00348 uint16_t dtype=datatype; 00349 00350 if (!((fwrite(fourcc, sizeof(char), 4, file)==4) && 00351 (fwrite(&endian, sizeof(uint16_t), 1, file)==1) && 00352 (fwrite(&dtype, sizeof(uint16_t), 1, file)==1))) 00353 SG_ERROR("Error writing header\n"); 00354 }