47 #include <sphinxbase/fe.h>
54 #include <sphinxbase/byteorder.h>
57 #include "sphinx_wave2feat.h"
58 #include "cmd_ln_defn.h"
95 int32 RemainingLength;
117 if ((fh = fopen(wtf->
infile,
"rb")) == NULL) {
121 if (fread(&hdr,
sizeof(hdr), 1, fh) != 1) {
122 E_ERROR(
"Failed to read RIFF header");
127 if (0 != memcmp(hdr.rifftag,
"RIFF", 4)) {
131 if (cmd_ln_int32_r(wtf->
config,
"-nchans") != hdr.numchannels) {
132 E_ERROR(
"Number of channels %d does not match configured value in file '%s'\n", hdr.numchannels, wtf->
infile);
136 if (cmd_ln_float32_r(wtf->
config,
"-samprate") != hdr.SamplingFreq) {
137 E_ERROR(
"Sample rate %.1f does not match configured value in file '%s'\n", hdr.SamplingFreq, wtf->
infile);
147 open_nist_file(
sphinx_wave2feat_t *wtf,
char const *infile, FILE **out_fh,
int detect_endian)
153 if ((fh = fopen(infile,
"rb")) == NULL) {
157 if (fread(&nist, 1, 7, fh) != 7) {
163 if (0 != strncmp(nist,
"NIST_1A", 7)) {
168 fseek(fh, 0, SEEK_SET);
174 if (strlen(li->buf) == 0) {
181 words = (
char **)
ckd_calloc(nword,
sizeof(*words));
183 if (0 == strcmp(words[0],
"sample_rate")) {
184 float samprate =
atof_c(words[2]);
185 if (cmd_ln_float32_r(wtf->
config,
"-samprate") != samprate) {
186 E_ERROR(
"Sample rate %.1f does not match configured value in file '%s'\n", samprate, infile);
192 if (0 == strcmp(words[0],
"channel_count")) {
193 int nchans = atoi(words[2]);
194 if (cmd_ln_int32_r(wtf->
config,
"-nchans") != nchans) {
195 E_ERROR(
"Number of channels %d does not match configured value in file '%s'\n", nchans, infile);
201 if (detect_endian && 0 == strcmp(words[0],
"sample_byte_format")) {
202 const char *endian = (0 == strcmp(words[2],
"10")) ?
"big" :
"little";
204 E_ERROR(
"Input endian %s does not match configured value in file '%s'\n", endian, infile);
213 fseek(fh, 1024, SEEK_SET);
230 if ((rv = open_nist_file(wtf, wtf->
infile, NULL, FALSE)) != TRUE)
235 if ((fh = popen(cmdline,
"r")) == NULL) {
248 E_ERROR(
"popen() not available, cannot run sph2pipe\n");
264 if ((rv = open_nist_file(wtf, wtf->
infile, &fh, TRUE)) != TRUE)
283 if ((fh = fopen(wtf->
infile,
"rb")) == NULL) {
304 if ((fh = fopen(wtf->
infile,
"rb")) == NULL) {
308 if (fread(&len, 4, 1, fh) != 1) {
313 fseek(fh, 0, SEEK_END);
317 flen = (flen / 4) - 1;
323 E_ERROR(
"Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
330 ?
"little" :
"big"));
333 fseek(fh, 4, SEEK_SET);
344 E_ERROR(
"Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
352 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
357 for (i = whichchan - 1; i < nsamp; i += nchans)
358 buf[i/nchans] = buf[i];
361 for (i = 0; i < nsamp; i += nchans) {
363 for (j = 0; j < nchans && i + j < nsamp; ++j) {
366 buf[i/nchans] = (int16)(tmp / nchans);
380 int32 n, nfr, nchans, whichchan;
383 nchans = cmd_ln_int32_r(wtf->
config,
"-nchans");
384 whichchan = cmd_ln_int32_r(wtf->
config,
"-whichchan");
385 fe_start_stream(wtf->
fe);
386 fe_start_utt(wtf->
fe);
390 int16
const *inspeech;
394 for (n = 0; n < nsamp; ++n)
395 SWAP_INT16(wtf->
audio + n);
400 nsamp = mixnpick_channels(wtf->
audio, nsamp, nchans, whichchan);
402 inspeech = wtf->
audio;
407 fe_process_frames(wtf->
fe, &inspeech, &nsamp, wtf->
feat, &nfr, NULL);
409 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
414 inspeech = wtf->
audio;
417 fe_end_utt(wtf->
fe, wtf->
feat[0], &nfr);
419 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
424 if (fclose(wtf->
infh) == EOF)
445 while ((n = fread(wtf->
feat[0],
sizeof(**wtf->
feat),
449 E_ERROR(
"Size of file %d not a multiple of veclen %d\n",
455 for (i = 0; i < n; ++i)
456 SWAP_FLOAT32(wtf->
feat[0] + i);
458 fe_float_to_mfcc(wtf->
fe, (float32 **)wtf->
feat, wtf->
feat, nfr);
459 for (i = 0; i < nfr; ++i) {
462 fe_logspec_to_mfcc(wtf->
fe, wtf->
feat[i], wtf->
feat[i]);
464 fe_logspec_dct2(wtf->
fe, wtf->
feat[i], wtf->
feat[i]);
467 fe_mfcc_dct3(wtf->
fe, wtf->
feat[i], wtf->
feat[i]);
470 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
475 if (fclose(wtf->
infh) == EOF)
482 {
"-mswav", &detect_riff, &decode_pcm },
483 {
"-nist", &detect_nist, &decode_pcm },
484 {
"-raw", &detect_raw, &decode_pcm },
485 {
"-sph2pipe", &detect_sph2pipe, &decode_pcm }
/* Entry count of `types`: the table's total size divided by the size of one entry. */
487 static const int ntypes =
sizeof(types)/
sizeof(types[0]);
489 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
500 if (fwrite(&nfloat, 4, 1, wtf->
outfh) != 1) {
517 fe_mfcc_to_float(wtf->
fe, frames, (float32 **)frames, nfr);
518 for (i = 0; i < nfr; ++i) {
519 if (fwrite(frames[i],
sizeof(float32), wtf->
veclen, wtf->
outfh) != wtf->
veclen) {
529 typedef enum htk_feature_kind_e {
542 } htk_feature_kind_t;
544 typedef enum htk_feature_flag_e {
555 } htk_feature_flag_t;
572 if (swap) SWAP_INT32(&nfloat);
573 if (fwrite(&nfloat, 4, 1, wtf->
outfh) != 1)
576 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->
config,
"-frate"));
577 if (swap) SWAP_INT32(&samp_period);
578 if (fwrite(&samp_period, 4, 1, wtf->
outfh) != 1)
581 samp_size = wtf->
veclen * 4;
582 if (swap) SWAP_INT16(&samp_size);
583 if (fwrite(&samp_size, 2, 1, wtf->
outfh) != 1)
/* Set the parameter-kind field to MFCC combined with the _O flag. */
590 param_kind = MFCC | _O;
/* Byte-swap the field in place when `swap` is set, then write it to the
 * output file as a single 2-byte item. The address-of operand had been
 * corrupted to a pilcrow sequence; it is restored to `&param_kind`. */
591 if (swap) SWAP_INT16(&param_kind);
592 if (fwrite(&param_kind, 2, 1, wtf->
outfh) != 1)
604 int i, j, swap, htk_reorder, nfloat = 0;
606 fe_mfcc_to_float(wtf->
fe, frames, (float32 **)frames, nfr);
609 htk_reorder = (0 == strcmp(
"htk", wtf->
ot->name)
612 for (i = 0; i < nfr; ++i) {
614 mfcc_t c0 = frames[i][0];
615 memmove(frames[i] + 1, frames[i], (wtf->
veclen - 1) * 4);
616 frames[i][wtf->
veclen - 1] = c0;
619 for (j = 0; j < wtf->
veclen; ++j)
620 SWAP_FLOAT32(frames[i] + j);
621 if (fwrite(frames[i],
sizeof(float32), wtf->
veclen, wtf->
outfh) != wtf->
veclen) {
637 int i, j, nfloat = 0;
639 fe_mfcc_to_float(wtf->
fe, frames, (float32 **)frames, nfr);
640 for (i = 0; i < nfr; ++i) {
641 for (j = 0; j < wtf->
veclen; ++j) {
642 fprintf(wtf->
outfh,
"%.5g", MFCC2FLOAT(frames[i][j]));
644 fprintf(wtf->
outfh,
"\n");
646 fprintf(wtf->
outfh,
" ");
654 {
"sphinx", &output_header_sphinx, &output_frames_sphinx },
655 {
"htk", &output_header_htk, &output_frames_htk },
656 {
"text", NULL, &output_frames_text }
/* Entry count of `outtypes`: the table's total size divided by the size of one entry. */
658 static const int nouttypes =
sizeof(outtypes)/
sizeof(outtypes[0]);
661 sphinx_wave2feat_init(
cmd_ln_t *config)
669 wtf->
fe = fe_init_auto_r(wtf->
config);
671 E_FATAL(
"Failed to create feature extraction\n");
675 for (i = 0; i < nouttypes; ++i) {
677 if (0 == strcmp(
cmd_ln_str_r(config,
"-ofmt"), otype->name)) {
682 if (i == nouttypes) {
683 E_ERROR(
"Unknown output type: '%s'\n",
685 sphinx_wave2feat_free(wtf);
709 if (fclose(wtf->
infh) == EOF)
713 if (fclose(wtf->
outfh) == EOF)
739 int rv = mfcc_type.detect(wtf);
746 for (i = 0; i < ntypes; ++i) {
750 rv = (*atype->detect)(wtf);
759 for (i = 0; i < ntypes; ++i) {
762 rv = (*atype->detect)(wtf);
781 char const *infile,
char const *outfile)
783 int nchans, nfloat, veclen;
787 E_INFO(
"Converting %s to %s\n", infile, outfile);
792 if ((atype = detect_audio_type(wtf)) == NULL)
801 wtf->
veclen = fe_get_output_size(wtf->
fe);
804 fe_get_input_size(wtf->
fe, &fshift, &fsize);
808 nchans = cmd_ln_int32_r(wtf->
config,
"-nchans");
810 if (wtf->
blocksize < (fsize + fshift) * nchans) {
811 E_INFO(
"Block size of %d too small, increasing to %d\n",
813 (fsize + fshift) * nchans);
814 wtf->
blocksize = (fsize + fshift) * nchans;
826 if ((wtf->
outfh = fopen(outfile,
"wb")) == NULL) {
831 if (wtf->
ot->output_header &&
832 (*wtf->
ot->output_header)(wtf, 0) < 0) {
838 if ((nfloat = (*atype->decode)(wtf)) < 0) {
843 if (wtf->
ot->output_header) {
844 if (fseek(wtf->
outfh, 0, SEEK_SET) < 0) {
848 if ((*wtf->
ot->output_header)(wtf, nfloat) < 0) {
870 if (fclose(wtf->
outfh) == EOF)
893 if (fclose(wtf->
outfh) == EOF)
901 build_filenames(
cmd_ln_t *config,
char const *basename,
902 char **out_infile,
char **out_outfile)
904 char const *di, *do_, *ei, *eo;
940 int nskip, runlen, npart;
942 if ((ctlfh = fopen(ctlfile,
"r")) == NULL) {
946 nskip = cmd_ln_int32_r(wtf->
config,
"-nskip");
947 runlen = cmd_ln_int32_r(wtf->
config,
"-runlen");
948 if ((npart = cmd_ln_int32_r(wtf->
config,
"-npart"))) {
950 int partlen, part, nlines = 0;
951 part = cmd_ln_int32_r(wtf->
config,
"-part");
954 fseek(ctlfh, 0, SEEK_SET);
955 partlen = nlines / npart;
956 nskip = partlen * (part - 1);
963 E_INFO(
"Processing %d utterances at position %d\n", runlen, nskip);
967 E_INFO(
"Processing all remaining utterances at position %d\n", nskip);
971 char *c, *infile, *outfile;
983 if ((c = strchr(li->buf,
' ')) != NULL)
985 if (strlen(li->buf) == 0) {
986 E_WARN(
"Empty line %d in control file, skipping\n", li->lineno);
989 build_filenames(wtf->
config, li->buf, &infile, &outfile);
992 sphinx_wave2feat_convert_file(wtf, infile, outfile);
1007 main(
int argc,
char *argv[])
1018 if (config == NULL) {
1019 E_ERROR(
"Command line parsing failed\n");
1023 if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1024 E_ERROR(
"Failed to initialize wave2feat object\n");
1032 rv = run_control_file(wtf,
cmd_ln_str_r(config,
"-c"));
1034 rv = sphinx_wave2feat_convert_file(wtf,
cmd_ln_str_r(config,
"-i"),
1037 sphinx_wave2feat_free(wtf);