10 #include <sys/types.h>
74 :
CFeatures(orig), num_vectors(orig.num_vectors),
75 single_string(orig.single_string),
76 length_of_single_string(orig.length_of_single_string),
77 max_string_length(orig.max_string_length),
78 num_symbols(orig.num_symbols),
79 original_num_symbols(orig.original_num_symbols),
80 order(orig.order), preprocess_on_get(false),
107 for (int32_t i=0; i<256; i++)
117 features(NULL), single_string(NULL), length_of_single_string(0),
118 max_string_length(0), order(0),
119 preprocess_on_get(false), feature_cache(NULL)
139 remove_all_subsets();
143 SG_FREE(single_string);
147 cleanup_feature_vectors(0, num_vectors-1);
161 SG_FREE(symbol_mask_table);
163 symbol_mask_table=NULL;
177 ASSERT(num<get_num_vectors())
181 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
182 SG_FREE(features[real_num].
string);
183 features[real_num].string=NULL;
184 features[real_num].slen=0;
186 determine_maximum_string_length();
192 if (features && get_num_vectors())
194 ASSERT(start<get_num_vectors())
195 ASSERT(stop<get_num_vectors())
197 for (int32_t i=start; i<=stop; i++)
199 int32_t real_num=m_subset_stack->subset_idx_conversion(i);
200 SG_FREE(features[real_num].
string);
201 features[real_num].string=NULL;
202 features[real_num].slen=0;
204 determine_maximum_string_length();
226 if (num>=get_num_vectors())
228 SG_ERROR(
"Index out of bounds (number of strings %d, you "
229 "requested %d)\n", get_num_vectors(), num);
234 ST* vec=get_feature_vector(num, l, free_vec);
235 ST* dst=SG_MALLOC(ST, l);
236 memcpy(dst, vec, l*
sizeof(ST));
237 free_feature_vector(vec, num, free_vec);
245 if (m_subset_stack->has_subsets())
246 SG_ERROR(
"A subset is set, cannot set feature vector\n")
248 if (num>=num_vectors)
250 SG_ERROR(
"Index out of bounds (number of strings %d, you "
251 "requested %d)\n", num_vectors, num);
255 SG_ERROR(
"String has zero or negative length\n")
257 cleanup_feature_vector(num);
258 features[num].slen=vector.
vlen;
259 features[num].string=SG_MALLOC(ST, vector.
vlen);
260 memcpy(features[num].
string, vector.
vector, vector.
vlen*
sizeof(ST));
262 determine_maximum_string_length();
267 preprocess_on_get=
true;
272 preprocess_on_get=
false;
278 if (num>=get_num_vectors())
279 SG_ERROR(
"Requested feature vector with index %d while total num is", num, get_num_vectors())
281 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
283 if (!preprocess_on_get)
286 len=features[real_num].slen;
287 return features[real_num].string;
291 SG_DEBUG(
"computing feature vector!\n")
292 ST* feat=compute_feature_vector(num, len);
295 if (get_num_preprocessors())
297 ST* tmp_feat_before=feat;
299 for (int32_t i=0; i<get_num_preprocessors(); i++)
304 SG_FREE(tmp_feat_before);
305 tmp_feat_before=feat;
328 num_feat=get_num_vectors();
329 num_vec=get_max_vector_length();
330 ASSERT(have_same_length())
332 SG_DEBUG(
"Allocating memory for transposed string features of size %ld\n",
333 int64_t(num_feat)*num_vec);
337 for (int32_t i=0; i<num_vec; i++)
339 sf[i].
string=SG_MALLOC(ST, num_feat);
343 for (int32_t i=0; i<num_feat; i++)
347 ST* vec=get_feature_vector(i, len, free_vec);
349 for (int32_t j=0; j<num_vec; j++)
350 sf[j].
string[i]=vec[j];
352 free_feature_vector(vec, i, free_vec);
359 if (num>=get_num_vectors())
362 "Trying to access string[%d] but num_str=%d\n", num,
366 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
369 feature_cache->unlock_entry(real_num);
377 if (num>=get_num_vectors())
380 "Trying to access string[%d] but num_str=%d\n", num,
384 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
387 feature_cache->unlock_entry(real_num);
392 ASSERT(vec_num<get_num_vectors())
396 ST* vec=get_feature_vector(vec_num, len, free_vec);
398 ST result=vec[feat_num];
399 free_feature_vector(vec, vec_num, free_vec);
406 ASSERT(vec_num<get_num_vectors())
410 ST* vec=get_feature_vector(vec_num, len, free_vec);
411 free_feature_vector(vec, vec_num, free_vec);
417 return max_string_length;
422 return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
436 return symbol_mask_table[mask] & symbol;
442 return (offset << (amount*alphabet->get_num_bits()));
448 return (symbol >> (amount*alphabet->get_num_bits()));
454 remove_all_subsets();
456 size_t blocksize=1024*1024;
457 size_t required_blocksize=0;
458 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
459 uint8_t* overflow=NULL;
460 int32_t overflow_len=0;
467 FILE* f=fopen(fname,
"ro");
474 SG_INFO(
"counting line numbers in file %s\n", fname)
476 size_t old_block_offs=0;
477 fseek(f, 0, SEEK_END);
478 size_t fsize=ftell(f);
484 SG_DEBUG(
"block_size=%ld file_size=%ld\n", blocksize, fsize)
487 while (sz == blocksize)
489 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
490 for (
size_t i=0; i<sz; i++)
493 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
496 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs);
497 old_block_offs=block_offs;
500 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t")
503 SG_INFO(
"found %d strings\n", num_vectors)
505 blocksize=required_blocksize;
506 dummy=SG_MALLOC(uint8_t, blocksize);
507 overflow=SG_MALLOC(uint8_t, blocksize);
513 while (sz == blocksize)
515 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
518 for (
size_t i=0; i<sz; i++)
520 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
522 int32_t len=i-old_sz;
524 max_string_length=
CMath::max(max_string_length, len+overflow_len);
526 features[lines].slen=len;
527 features[lines].string=SG_MALLOC(ST, len);
531 for (int32_t j=0; j<overflow_len; j++)
532 features[lines].
string[j]=alpha->
remap_to_bin(overflow[j]);
533 for (int32_t j=0; j<len; j++)
534 features[lines].
string[j+overflow_len]=alpha->
remap_to_bin(dummy[old_sz+j]);
540 for (int32_t j=0; j<overflow_len; j++)
541 features[lines].
string[j]=overflow[j];
542 for (int32_t j=0; j<len; j++)
543 features[lines].
string[j+overflow_len]=dummy[old_sz+j];
554 SG_PROGRESS(lines, 0, num_vectors, 1,
"LOADING:\t")
557 for (
size_t i=old_sz; i<sz; i++)
558 overflow[i-old_sz]=dummy[i];
560 overflow_len=sz-old_sz;
565 SG_INFO(
"file successfully read\n")
566 SG_INFO(
"max_string_length=%d\n", max_string_length)
567 SG_INFO(
"num_strings=%d\n", num_vectors)
581 num_symbols=alphabet->get_num_symbols();
586 remove_all_subsets();
602 if (len>0 && s[0]==
'>')
607 SG_ERROR(
"No fasta hunks (lines starting with '>') found\n")
612 num_symbols=alphabet->get_num_symbols();
625 int32_t spanned_lines=0;
630 SG_ERROR(
"Error reading fasta entry in line %d len=%ld", 4*i+1, len)
632 if (s[0]==
'>' || offs==f.
get_size())
641 len=fasta_len-spanned_lines;
642 strings[i].
string=SG_MALLOC(ST, len);
645 ST* str=strings[i].
string;
647 SG_DEBUG(
"'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len,
id, (int32_t) len, (int32_t) spanned_lines)
649 for (int32_t j=0; j<fasta_len; j++)
656 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
659 if (uint64_t(idx)>=len)
660 SG_ERROR(
"idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
674 return set_features(strings, num, max_len);
678 bool ignore_invalid,
bool bitremap_in_single_string)
680 remove_all_subsets();
692 SG_ERROR(
"Number of lines must be divisible by 4 in fastq files\n")
702 if (bitremap_in_single_string)
705 strings[0].
string=SG_MALLOC(ST, num);
712 original_num_symbols=alphabet->get_num_symbols();
713 str=SG_MALLOC(ST, len);
721 SG_ERROR(
"Error reading 'read' identifier in line %d", 4*i)
725 SG_ERROR(
"Error reading 'read' in line %d len=%ld", 4*i+1, len)
727 if (bitremap_in_single_string)
729 if (len!=(uint64_t) order)
730 SG_ERROR(
"read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
731 for (int32_t j=0; j<order; j++)
732 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
734 strings[0].
string[i]=embed_word(str, order);
738 strings[i].
string=SG_MALLOC(ST, len);
744 for (uint64_t j=0; j<len; j++)
746 if (alphabet->is_valid((uint8_t) s[j]))
754 for (uint64_t j=0; j<len; j++)
762 SG_ERROR(
"Error reading 'read' quality identifier in line %d", 4*i+2)
765 SG_ERROR(
"Error reading 'read' quality in line %d", 4*i+3)
768 if (bitremap_in_single_string)
772 max_string_length=max_len;
780 remove_all_subsets();
782 struct dirent **namelist;
789 n=scandir(dirname, &namelist, &
SGIO::filter, alphasort);
792 SG_ERROR(
"error calling scandir - no files found\n")
806 for (int32_t i=0; i<n; i++)
813 if (!stat(fname, &s) && s.st_size>0)
815 filesize=s.st_size/
sizeof(ST);
817 FILE* f=fopen(fname,
"ro");
820 ST* str=SG_MALLOC(ST, filesize);
821 SG_DEBUG(
"%s:%ld\n", fname, (int64_t) filesize)
822 if (fread(str,
sizeof(ST), filesize, f)!=(
size_t) filesize)
825 strings[num].
slen=filesize;
826 max_len=
CMath::max(max_len, strings[num].slen);
833 SG_ERROR(
"empty or non readable file \'%s\'\n", fname)
835 SG_FREE(namelist[i]);
839 if (num>0 && strings)
841 set_features(strings, num, max_len);
855 if (m_subset_stack->has_subsets())
856 SG_ERROR(
"Cannot call set_features() with subset.\n")
863 for (int32_t i=0; i<p_num_vectors; i++)
879 memcpy(features,p_features,
sizeof(
SGString<ST>)*p_num_vectors);
880 num_vectors = p_num_vectors;
881 max_string_length = p_max_string_length;
896 if (m_subset_stack->has_subsets())
897 SG_ERROR(
"Cannot call set_features() with subset.\n")
902 for (int32_t i=0; i<sf_num_str; i++)
905 int32_t length=sf->
features[real_i].slen;
906 new_features[i].
string=SG_MALLOC(ST, length);
907 memcpy(new_features[i].
string, sf->
features[real_i].string, length);
908 new_features[i].
slen=length;
910 return append_features(new_features, sf_num_str,
916 if (m_subset_stack->has_subsets())
917 SG_ERROR(
"Cannot call set_features() with subset.\n")
920 return set_features(p_features, p_num_vectors, p_max_string_length);
925 for (int32_t i=0; i<p_num_vectors; i++)
934 for (int32_t i=0; i<p_num_vectors; i++)
935 alphabet->add_string_to_histogram( p_features[i].
string, p_features[i].
slen);
937 int32_t old_num_vectors=num_vectors;
938 num_vectors=old_num_vectors+p_num_vectors;
941 for (int32_t i=0; i<num_vectors; i++)
943 if (i<old_num_vectors)
945 new_features[i].
string=features[i].string;
946 new_features[i].
slen=features[i].slen;
950 new_features[i].
string=p_features[i-old_num_vectors].
string;
951 new_features[i].
slen=p_features[i-old_num_vectors].
slen;
957 this->features=new_features;
958 max_string_length=
CMath::max(max_string_length, p_max_string_length);
977 if (m_subset_stack->has_subsets())
978 SG_ERROR(
"get features() is not possible on subset")
981 max_str_len=max_string_length;
989 num_str=get_num_vectors();
990 max_str_len=max_string_length;
993 for (int32_t i=0; i<num_str; i++)
997 ST* vec=get_feature_vector(i, len, free_vec);
998 new_feat[i].
string=SG_MALLOC(ST, len);
999 new_feat[i].
slen=len;
1000 memcpy(new_feat[i].
string, vec, ((
size_t) len) *
sizeof(ST));
1001 free_feature_vector(vec, i, free_vec);
1010 int32_t max_str_len;
1011 *dst=copy_features(num_vec, max_str_len);
1017 remove_all_subsets();
1021 if (!(file=fopen(src,
"r")))
1027 if (fread(&
id[0],
sizeof(
char), 1, file)!=1)
1030 if (fread(&
id[1],
sizeof(
char), 1, file)!=1)
1033 if (fread(&
id[2],
sizeof(
char), 1, file)!=1)
1036 if (fread(&
id[3],
sizeof(
char), 1, file)!=1)
1042 if (fread(&c,
sizeof(uint8_t), 1, file)!=1)
1043 SG_ERROR(
"failed to read compression type")
1048 if (fread(&a,
sizeof(uint8_t), 1, file)!=1)
1049 SG_ERROR(
"failed to read compression alphabet")
1052 if (fread(&num_vectors,
sizeof(int32_t), 1, file)!=1)
1053 SG_ERROR(
"failed to read compression number of vectors")
1056 if (fread(&max_string_length,
sizeof(int32_t), 1, file)!=1)
1057 SG_ERROR(
"failed to read maximum string length")
1058 ASSERT(max_string_length>0)
1063 for (int32_t i=0; i<num_vectors; i++)
1066 int32_t len_compressed;
1067 if (fread(&len_compressed,
sizeof(int32_t), 1, file)!=1)
1068 SG_ERROR(
"failed to read vector length compressed")
1070 int32_t len_uncompressed;
1071 if (fread(&len_uncompressed,
sizeof(int32_t), 1, file)!=1)
1072 SG_ERROR(
"failed to read vector length uncompressed")
1077 features[i].string=SG_MALLOC(ST, len_uncompressed);
1078 features[i].slen=len_uncompressed;
1079 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1080 if (fread(compressed,
sizeof(uint8_t), len_compressed, file)!=(
size_t) len_compressed)
1081 SG_ERROR(
"failed to read compressed data (expected %d bytes)", len_compressed)
1082 uint64_t uncompressed_size=len_uncompressed;
1083 uncompressed_size*=
sizeof(ST);
1084 compressor->
decompress(compressed, len_compressed,
1085 (uint8_t*) features[i].
string, uncompressed_size);
1086 SG_FREE(compressed);
1087 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*
sizeof(ST))
1091 int32_t offs=
CMath::ceil(2.0*
sizeof(int32_t)/
sizeof(ST));
1092 features[i].string=SG_MALLOC(ST, len_compressed+offs);
1093 features[i].slen=len_compressed+offs;
1094 int32_t* feat32ptr=((int32_t*) (features[i].
string));
1095 memset(features[i].
string, 0, offs*
sizeof(ST));
1096 feat32ptr[0]=(int32_t) len_compressed;
1097 feat32ptr[1]=(int32_t) len_uncompressed;
1098 uint8_t* compressed=(uint8_t*) (&features[i].
string[offs]);
1099 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1100 SG_ERROR(
"failed to read uncompressed data")
1112 if (m_subset_stack->has_subsets())
1113 SG_ERROR(
"save_compressed() is not possible on subset")
1117 if (!(file=fopen(dest,
"wb")))
1123 const char*
id=
"SGV0";
1124 fwrite(&
id[0],
sizeof(
char), 1, file);
1125 fwrite(&
id[1],
sizeof(
char), 1, file);
1126 fwrite(&
id[2],
sizeof(
char), 1, file);
1127 fwrite(&
id[3],
sizeof(
char), 1, file);
1130 uint8_t c=(uint8_t) compression;
1131 fwrite(&c,
sizeof(uint8_t), 1, file);
1133 uint8_t a=(uint8_t) alphabet->get_alphabet();
1134 fwrite(&a,
sizeof(uint8_t), 1, file);
1136 fwrite(&num_vectors,
sizeof(int32_t), 1, file);
1138 fwrite(&max_string_length,
sizeof(int32_t), 1, file);
1141 for (int32_t i=0; i<num_vectors; i++)
1145 ST* vec=get_feature_vector(i, len, vfree);
1147 uint8_t* compressed=NULL;
1148 uint64_t compressed_size=0;
1150 compressor->
compress((uint8_t*) vec, ((uint64_t) len)*
sizeof(ST),
1151 compressed, compressed_size, level);
1153 int32_t len_compressed=(int32_t) compressed_size;
1155 fwrite(&len_compressed,
sizeof(int32_t), 1, file);
1157 fwrite(&len,
sizeof(int32_t), 1, file);
1159 fwrite(compressed, compressed_size, 1, file);
1160 SG_FREE(compressed);
1162 free_feature_vector(vec, i, vfree);
1172 SG_DEBUG(
"force: %d\n", force_preprocessing)
1174 for (int32_t i=0; i<get_num_preprocessors(); i++)
1176 if ( (!is_preprocessed(i) || force_preprocessing) )
1178 set_preprocessed(i);
1196 if (m_subset_stack->has_subsets())
1201 ASSERT(num_vectors==1 || single_string)
1202 ASSERT(max_string_length>=window_size ||
1203 (single_string && length_of_single_string>=window_size));
1208 num_vectors= (length_of_single_string-window_size)/step_size + 1;
1209 else if (num_vectors==1)
1211 num_vectors= (max_string_length-window_size)/step_size + 1;
1212 length_of_single_string=max_string_length;
1217 for (int32_t i=0; i<num_vectors; i++)
1219 f[i].
string=&features[0].string[offs+skip];
1220 f[i].
slen=window_size-skip;
1223 single_string=features[0].string;
1226 max_string_length=window_size-skip;
1234 if (m_subset_stack->has_subsets())
1239 ASSERT(num_vectors==1 || single_string)
1240 ASSERT(max_string_length>=window_size ||
1241 (single_string && length_of_single_string>=window_size));
1251 len=length_of_single_string;
1254 single_string=features[0].string;
1255 len=max_string_length;
1256 length_of_single_string=max_string_length;
1260 for (int32_t i=0; i<num_vectors; i++)
1264 if (p>=0 && p<=len-window_size)
1266 f[i].
string=&features[0].string[p+skip];
1267 f[i].
slen=window_size-skip;
1272 max_string_length=len;
1273 features[0].slen=len;
1276 SG_ERROR(
"window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1277 window_size, i, p, len);
1284 max_string_length=window_size-skip;
1291 return obtain_from_char_features(sf, start, p_order, gap, rev);
1298 if (len!=max_string_length)
1301 len=max_string_length;
1303 index_t num_str=get_num_vectors();
1304 for (int32_t i=0; i<num_str; i++)
1306 if (get_vector_length(i)!=len)
1315 if (m_subset_stack->has_subsets())
1318 ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
1321 original_num_symbols=alphabet->get_num_symbols();
1322 int32_t max_val=alphabet->get_num_bits();
1327 num_symbols=original_num_symbols;
1329 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
1332 SG_WARNING(
"symbols did not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val)
1335 for (int32_t i=0; i<p_order*max_val; i++)
1336 mask= (mask<<1) | ((ST) 1);
1338 for (int32_t i=0; i<num_vectors; i++)
1340 int32_t len=features[i].slen;
1343 SG_ERROR(
"Sequence must be longer than order (%d vs. %d)\n", len, p_order)
1345 ST* str=features[i].string;
1348 for (int32_t j=0; j<p_order; j++)
1349 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1350 str[0]=embed_word(&str[0], p_order);
1354 for (int32_t j=p_order; j<len; j++)
1356 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1357 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1361 features[i].slen=len-p_order+1;
1364 compute_symbol_mask_table(max_val);
1369 if (m_subset_stack->has_subsets())
1372 SG_FREE(symbol_mask_table);
1373 symbol_mask_table=SG_MALLOC(ST, 256);
1374 symbol_mask_table_len=256;
1377 for (int32_t i=0; i< (int64_t) max_val; i++)
1380 for (int32_t i=0; i<256; i++)
1382 uint8_t bits=(uint8_t) i;
1383 symbol_mask_table[i]=0;
1385 for (int32_t j=0; j<8; j++)
1388 symbol_mask_table[i]|=mask<<(max_val*j);
1397 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1400 for (uint32_t i=0; i<nbits; i++)
1401 mask=(mask<<1) | (ST) 1;
1403 for (int32_t i=0; i<len; i++)
1406 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1414 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1415 for (int32_t i=0; i<len; i++)
1426 max_string_length=0;
1427 index_t num_str=get_num_vectors();
1429 for (int32_t i=0; i<num_str; i++)
1431 max_string_length=
CMath::max(max_string_length,
1432 features[m_subset_stack->subset_idx_conversion(i)].slen);
1439 ST* s=SG_MALLOC(ST, l+1);
1440 memcpy(s, str.
string,
sizeof(ST)*l);
1448 ASSERT(num<get_num_vectors())
1450 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1453 features[real_num].slen=len ;
1454 features[real_num].string=string ;
1456 max_string_length=
CMath::max(len, max_string_length);
1461 int32_t nsym=get_num_symbols();
1462 int32_t slen=get_max_vector_length();
1463 int64_t sz=int64_t(nsym)*slen*
sizeof(
float64_t);
1468 memset(h_normalizer, 0, slen*
sizeof(
float64_t));
1469 int32_t num_str=get_num_vectors();
1470 for (int32_t i=0; i<num_str; i++)
1474 ST* vec=get_feature_vector(i, len, free_vec);
1475 for (int32_t j=0; j<len; j++)
1477 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1480 free_feature_vector(vec, i, free_vec);
1485 for (int32_t i=0; i<slen; i++)
1487 for (int32_t j=0; j<nsym; j++)
1489 if (h_normalizer && h_normalizer[i])
1490 h[int64_t(i)*nsym+j]/=h_normalizer[i];
1494 SG_FREE(h_normalizer);
1503 ASSERT(rows == get_num_symbols())
1508 for (int32_t i=0; i<num_vec; i++)
1510 sf[i].
string=SG_MALLOC(ST, cols);
1515 for (int32_t j=0; j<cols; j++)
1520 for (c=0; c<rows-1; c++)
1522 if (randoms[j]<=lik)
1524 lik+=hist[int64_t(j)*rows+c+1];
1526 sf[i].
string[j]=alphabet->remap_to_char(c);
1530 set_features(sf, num_vec, cols);
1613 index_t real_idx=m_subset_stack->subset_idx_conversion(indices.
vector[i]);
1619 current_string.
slen*
sizeof(ST));
1620 list_copy.
strings[i]=string_copy;
1630 result->
order=order;
1641 determine_maximum_string_length();
1646 ASSERT(features && num<get_num_vectors())
1648 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1650 len=features[real_num].slen;
1654 ST* target=SG_MALLOC(ST, len);
1655 memcpy(target, features[real_num].
string, len*
sizeof(ST));
1667 length_of_single_string=0;
1668 max_string_length=0;
1670 preprocess_on_get=
false;
1672 symbol_mask_table=NULL;
1673 symbol_mask_table_len=0;
1675 original_num_symbols=0;
1677 m_parameters->add((
CSGObject**) &alphabet,
"alphabet");
1678 m_parameters->add_vector(&features, &num_vectors,
"features",
1679 "This contains the array of features.");
1680 m_parameters->add_vector(&single_string,
1681 &length_of_single_string,
1683 "Created by sliding window.");
1684 m_parameters->add(&max_string_length,
"max_string_length",
1685 "Length of longest string.");
1686 m_parameters->add(&num_symbols,
"num_symbols",
1687 "Number of used symbols.");
1688 m_parameters->add(&original_num_symbols,
"original_num_symbols",
1689 "Original number of used symbols.");
1690 m_parameters->add(&order,
"order",
1691 "Order used in higher order mapping.");
1692 m_parameters->add(&preprocess_on_get,
"preprocess_on_get",
1693 "Preprocess on-the-fly?");
1695 m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len,
"mask_table",
"Symbol mask table - using in higher order mapping");
1914 #define LOAD(f_load, sg_type) \
1915 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1917 SG_INFO("loading...\n") \
1920 SGString<sg_type>* strs; \
1923 loader->f_load(strs, num_str, max_len); \
1924 set_features(strs, num_str, max_len); \
1928 LOAD(get_string_list,
bool)
1929 LOAD(get_string_list,
char)
1930 LOAD(get_string_list, int8_t)
1931 LOAD(get_string_list, uint8_t)
1932 LOAD(get_string_list, int16_t)
1933 LOAD(get_string_list, uint16_t)
1934 LOAD(get_string_list, int32_t)
1935 LOAD(get_string_list, uint32_t)
1936 LOAD(get_string_list, int64_t)
1937 LOAD(get_string_list, uint64_t)
1943 #define SAVE(f_write, sg_type) \
1944 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1946 if (m_subset_stack->has_subsets()) \
1947 SG_ERROR("save() is not possible on subset") \
1950 writer->f_write(features, num_vectors); \
1954 SAVE(set_string_list,
bool)
1955 SAVE(set_string_list,
char)
1956 SAVE(set_string_list, int8_t)
1957 SAVE(set_string_list, uint8_t)
1958 SAVE(set_string_list, int16_t)
1959 SAVE(set_string_list, uint16_t)
1960 SAVE(set_string_list, int32_t)
1961 SAVE(set_string_list, uint32_t)
1962 SAVE(set_string_list, int64_t)
1963 SAVE(set_string_list, uint64_t)
1969 template <
class ST>
template <
class CT>
1971 int32_t p_order, int32_t gap,
bool rev)
1973 remove_all_subsets();
1979 this->order=p_order;
1990 for (int32_t i=0; i<num_vectors; i++)
1997 features[i].string=SG_MALLOC(ST, len);
1998 features[i].slen=len;
2000 ST* str=features[i].string;
2001 for (int32_t j=0; j<len; j++)
2013 num_symbols=original_num_symbols;
2014 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
2018 SG_ERROR(
"symbol does not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val)
2022 SG_DEBUG(
"translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap,
sizeof(ST))
2023 for (int32_t line=0; line<num_vectors; line++)
2027 ST* fv=get_feature_vector(line, len, vfree);
2031 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2033 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2036 features[line].slen-=start+gap ;
2037 if (features[line].slen<0)
2038 features[line].slen=0 ;
2041 compute_symbol_mask_table(max_val);
void disable_on_the_fly_preprocessing()
CSubsetStack * m_subset_stack
int32_t get_num_symbols_in_histogram()
void determine_maximum_string_length()
static void random_vector(T *vec, int32_t len, T min_value, T max_value)
bool load_fasta_file(const char *fname, bool ignore_invalid=false)
void set_feature_vector(SGVector< ST > vector, int32_t num)
void set_features(SGStringList< ST > feats)
virtual CFeatures * duplicate() const
static float64_t ceil(float64_t d)
virtual EFeatureType get_feature_type() const
virtual SGString< ST > * copy_features(int32_t &num_str, int32_t &max_str_len)
EAlphabet
Alphabet of charfeatures/observations.
int32_t get_max_value_in_histogram()
bool check_alphabet_size(bool print_error=true)
SGString< ST > * features
ST shift_offset(ST offset, int32_t amount)
#define SAVE(f_write, sg_type)
virtual const char * get_name() const
return the name of the preprocessor
char * get_line(uint64_t &len, uint64_t &offs)
int32_t get_num_elements() const
#define SG_NOTIMPLEMENTED
virtual void load(CFile *loader)
The class Alphabet implements an alphabet and alphabet utility functions.
ST get_masked_symbols(ST symbol, uint8_t mask)
Compression library for compressing and decompressing buffers using one of the standard compression algorithms.
void compute_symbol_mask_table(int64_t max_val)
floatmax_t get_max_num_symbols()
const T & get_element(int32_t idx1, int32_t idx2=0, int32_t idx3=0) const
floatmax_t num_symbols
number of used symbols
virtual EFeatureClass get_feature_class() const
virtual int32_t get_num_vectors() const
static int filter(CONST_DIRENT_T *d)
virtual void cleanup_feature_vectors(int32_t start, int32_t stop)
uint8_t remap_to_bin(uint8_t c)
EFeatureClass
shogun feature class
int32_t symbol_mask_table_len
length of the symbol mask table
virtual bool save_compressed(char *dest, E_COMPRESSION_TYPE compression, int level)
virtual int32_t get_vector_length(int32_t vec_num)
void enable_on_the_fly_preprocessing()
int32_t order
order used in higher order mapping
void add_string_to_histogram(T *p, int64_t len)
bool obtain_from_char(CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
Template class StringFeatures implements a list of strings.
Class SGObject is the base class of all shogun objects.
ST shift_symbol(ST symbol, int32_t amount)
#define LOAD(f_load, sg_type)
virtual void get_histogram(float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true)
int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
floatmax_t get_original_num_symbols()
virtual ST * apply_to_string(ST *f, int32_t &len)=0
apply preproc on single feature vector
int32_t get_num_symbols() const
A File access base class.
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
void load_ascii_file(char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
virtual ~CStringFeatures()
index_t max_string_length
index_t subset_idx_conversion(index_t idx) const
SGVector< ST > get_feature_vector(int32_t num)
static T max(T a, T b)
return the maximum of two integers
SGStringList< ST > get_features()
floatmax_t get_num_symbols()
Template class StringPreprocessor, base class for preprocessors (cf. CPreprocessor) that apply to CStringFeatures.
int32_t obtain_by_position_list(int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0)
virtual void create_random(float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec)
bool check_alphabet(bool print_error=true)
virtual CFeatures * copy_subset(SGVector< index_t > indices)
int32_t get_num_bits() const
EFeatureType
shogun feature type
void compress(uint8_t *uncompressed, uint64_t uncompressed_size, uint8_t *&compressed, uint64_t &compressed_size, int32_t level=1)
void unembed_word(ST word, uint8_t *seq, int32_t len)
bool append_features(CStringFeatures< ST > *sf)
static void set_dirname(const char *dirname)
ST * symbol_mask_table
symbol mask table used in higher order mapping
bool load_fastq_file(const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false)
virtual void subset_changed_post()
The class Features is the base class of all feature objects.
void decompress(uint8_t *compressed, uint64_t compressed_size, uint8_t *uncompressed, uint64_t &uncompressed_size)
int32_t max_string_length
virtual int32_t get_max_vector_length()
CAlphabet * get_alphabet()
static char * concat_filename(const char *filename)
floatmax_t original_num_symbols
original number of used symbols (before higher order mapping)
virtual void cleanup_feature_vector(int32_t num)
virtual bool load_compressed(char *src, bool decompress)
static ST * get_zero_terminated_string_copy(SGString< ST > str)
static floatmax_t powl(floatmax_t x, floatmax_t n)
x^n
virtual bool apply_to_string_features(CFeatures *f)=0
virtual ST get_feature(int32_t vec_num, int32_t feat_num)
void embed_features(int32_t p_order)
bool have_same_length(int32_t len=-1)
ST embed_word(ST *seq, int32_t len)
virtual ST * compute_feature_vector(int32_t num, int32_t &len)
bool load_from_directory(char *dirname)
CStringFeatures< ST > * get_transposed()
virtual bool apply_preprocessor(bool force_preprocessing=false)
template class SGStringList