18 using namespace shogun;
40 CFile(fname, rw, name)
56 is_data_transposed=value;
63 m_delimiter=delimiter;
71 m_num_to_skip=num_lines;
98 m_line_reader->
reset();
103 void CCSVFile::init()
105 is_data_transposed=
false;
110 m_line_tokenizer=NULL;
115 void CCSVFile::init_with_defaults()
117 is_data_transposed=
false;
135 void CCSVFile::skip_lines(int32_t num_lines)
137 for (int32_t i=0; i<num_lines; i++)
141 #define GET_VECTOR(read_func, sg_type) \
142 void CCSVFile::get_vector(sg_type*& vector, int32_t& len) \
144 if (!m_line_reader->has_next()) \
147 int32_t num_feat=0; \
149 get_matrix(vector, num_feat, num_vec); \
180 #define GET_MATRIX(read_func, sg_type) \
181 void CCSVFile::get_matrix(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \
183 int32_t num_lines=0; \
184 int32_t num_tokens=-1; \
185 int32_t current_line_idx=0; \
186 SGVector<char> line; \
188 skip_lines(m_num_to_skip); \
189 num_lines=get_stats(num_tokens); \
193 matrix=SG_MALLOC(sg_type, num_lines*num_tokens); \
194 skip_lines(m_num_to_skip); \
195 while (m_line_reader->has_next()) \
197 line=m_line_reader->read_line(); \
198 m_parser->set_text(line); \
200 for (int32_t i=0; i<num_tokens; i++) \
202 if (!m_parser->has_next()) \
205 if (!is_data_transposed) \
206 matrix[i+current_line_idx*num_tokens]=m_parser->read_func(); \
208 matrix[current_line_idx+i*num_tokens]=m_parser->read_func(); \
210 current_line_idx++; \
215 if (!is_data_transposed) \
217 num_feat=num_tokens; \
222 num_feat=num_lines; \
223 num_vec=num_tokens; \
241 #define GET_NDARRAY(read_func, sg_type) \
242 void CCSVFile::get_ndarray(sg_type*& array, int32_t*& dims, int32_t& num_dims) \
256 #define GET_SPARSE_MATRIX(read_func, sg_type) \
257 void CCSVFile::get_sparse_matrix( \
258 SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \
276 #undef GET_SPARSE_MATRIX
278 #define SET_VECTOR(format, sg_type) \
279 void CCSVFile::set_vector(const sg_type* vector, int32_t len) \
283 if (!is_data_transposed) \
285 for (int32_t i=0; i<len; i++) \
286 fprintf(file, "%" format "\n", vector[i]); \
291 for (i=0; i<len-1; i++) \
292 fprintf(file, "%" format "%c", vector[i], m_delimiter); \
293 fprintf(file, "%" format "\n", vector[i]); \
313 #define SET_MATRIX(format, sg_type) \
314 void CCSVFile::set_matrix(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \
318 if (!is_data_transposed) \
320 for (int32_t i=0; i<num_vec; i++) \
323 for (j=0; j<num_feat-1; j++) \
324 fprintf(file, "%" format "%c", matrix[j+i*num_feat], m_delimiter); \
325 fprintf(file, "%" format "\n", matrix[j+i*num_feat]); \
330 for (int32_t i=0; i<num_feat; i++) \
333 for (j=0; j<num_vec-1; j++) \
334 fprintf(file, "%" format "%c", matrix[i+j*num_vec], m_delimiter); \
335 fprintf(file, "%" format "\n", matrix[i+j*num_vec]); \
356 #define SET_SPARSE_MATRIX(format, sg_type) \
357 void CCSVFile::set_sparse_matrix( \
358 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \
376 #undef SET_SPARSE_MATRIX
380 int32_t& max_string_len)
383 int32_t current_line_idx=0;
384 int32_t num_tokens=0;
390 skip_lines(m_num_to_skip);
394 strings[current_line_idx].
slen=line.
vlen;
395 strings[current_line_idx].
string=SG_MALLOC(
char, line.
vlen);
396 for (int32_t i=0; i<line.
vlen; i++)
397 strings[current_line_idx].
string[i]=line[i];
399 if (line.
vlen>max_string_len)
400 max_string_len=line.
vlen;
405 num_str=current_line_idx;
408 #define GET_STRING_LIST(sg_type) \
409 void CCSVFile::get_string_list( \
410 SGString<sg_type>*& strings, int32_t& num_str, \
411 int32_t& max_string_len) \
427 #undef GET_STRING_LIST
432 for (int32_t i=0; i<num_str; i++)
434 for (int32_t j=0; j<strings[i].
slen; j++)
435 fprintf(
file,
"%c", strings[i].
string[j]);
440 #define SET_STRING_LIST(sg_type) \
441 void CCSVFile::set_string_list( \
442 const SGString<sg_type>* strings, int32_t num_str) \
458 #undef SET_STRING_LIST
463 char *last = s.
start;
466 if (*s.
start == delim)
void set_delimiter(char delimiter)
#define GET_MATRIX(read_func, sg_type)
void set_transpose(bool value)
Class v_array taken directly from JL's implementation.
#define GET_NDARRAY(read_func, sg_type)
virtual index_t next_token_idx(index_t &start)
virtual void set_string_list(const SGString< uint8_t > *strings, int32_t num_str)
#define GET_SPARSE_MATRIX(read_func, sg_type)
#define SET_STRING_LIST(sg_type)
Class for buffered reading from an ASCII file.
struct Substring, specified by start position and end position.
void set_lines_to_skip(int32_t num_lines)
static void tokenize(char delim, substring s, v_array< substring > &ret)
void push(const T &new_elem)
int32_t get_stats(int32_t &num_tokens)
virtual SGVector< char > read_line()
#define GET_STRING_LIST(sg_type)
#define SET_MATRIX(format, sg_type)
Class for reading from a string.
A File access base class.
#define SET_VECTOR(format, sg_type)
#define GET_VECTOR(read_func, sg_type)
#define SET_SPARSE_MATRIX(format, sg_type)
void set_tokenizer(CTokenizer *tokenizer)
The class CDelimiterTokenizer is used to tokenize a SGVector into tokens using custom chars as delimiters.
SGVector< bool > delimiters
virtual void set_text(SGVector< char > txt)
virtual void get_string_list(SGString< uint8_t > *&strings, int32_t &num_str, int32_t &max_string_len)