SphinxBase  5prealpha
sphinx_fe.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <time.h>
41 #include <assert.h>
42 
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46 
47 #include <sphinxbase/fe.h>
48 #include <sphinxbase/strfuncs.h>
49 #include <sphinxbase/pio.h>
50 #include <sphinxbase/filename.h>
51 #include <sphinxbase/cmd_ln.h>
52 #include <sphinxbase/err.h>
53 #include <sphinxbase/ckd_alloc.h>
54 #include <sphinxbase/byteorder.h>
55 #include <sphinxbase/hash_table.h>
56 
57 #include "sphinx_wave2feat.h"
58 #include "cmd_ln_defn.h"
59 
60 typedef struct audio_type_s {
61  char const *name;
62  int (*detect)(sphinx_wave2feat_t *wtf);
63  int (*decode)(sphinx_wave2feat_t *wtf);
64 } audio_type_t;
65 
66 typedef struct output_type_s {
67  char const *name;
68  int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
69  int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
71 
73  int refcount;
75  fe_t *fe;
76  char *infile;
77  char *outfile;
78  FILE *infh;
79  FILE *outfh;
80  short *audio;
81  mfcc_t **feat;
82  int blocksize;
83  int featsize;
84  int veclen;
85  int in_veclen;
86  int byteswap;
87  output_type_t const *ot;
88 };
89 
91 typedef struct RIFFHeader{
92  char rifftag[4]; /* "RIFF" string */
93  int32 TotalLength; /* Total length */
94  char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
95  int32 RemainingLength; /* Remaining length */
96  int16 data_format; /* data format tag, 1 = PCM */
97  int16 numchannels; /* Number of channels in file */
98  int32 SamplingFreq; /* Sampling frequency */
99  int32 BytesPerSec; /* Average bytes/sec */
100  int16 BlockAlign; /* Block align */
101  int16 BitsPerSample; /* 8 or 16 bit */
102  char datatag[4]; /* "data" string */
103  int32 datalength; /* Raw data length */
104 } MSWAV_hdr;
105 
111 static int
112 detect_riff(sphinx_wave2feat_t *wtf)
113 {
114  FILE *fh;
115  MSWAV_hdr hdr;
116 
117  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
118  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
119  return -1;
120  }
121  if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
122  E_ERROR("Failed to read RIFF header");
123  fclose(fh);
124  return -1;
125  }
126  /* Make sure it is actually a RIFF file. */
127  if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
128  fclose(fh);
129  return FALSE;
130  }
131  if (cmd_ln_int32_r(wtf->config, "-nchans") != hdr.numchannels) {
132  E_ERROR("Number of channels %d does not match configured value in file '%s'\n", hdr.numchannels, wtf->infile);
133  fclose(fh);
134  return -1;
135  }
136  if (cmd_ln_float32_r(wtf->config, "-samprate") != hdr.SamplingFreq) {
137  E_ERROR("Sample rate %.1f does not match configured value in file '%s'\n", hdr.SamplingFreq, wtf->infile);
138  fclose(fh);
139  return -1;
140  }
141  wtf->infh = fh;
142 
143  return TRUE;
144 }
145 
146 static int
147 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
148 {
149  char nist[7];
150  lineiter_t *li;
151  FILE *fh;
152 
153  if ((fh = fopen(infile, "rb")) == NULL) {
154  E_ERROR_SYSTEM("Failed to open %s", infile);
155  return -1;
156  }
157  if (fread(&nist, 1, 7, fh) != 7) {
158  E_ERROR_SYSTEM("Failed to read NIST header");
159  fclose(fh);
160  return -1;
161  }
162  /* Is this actually a NIST file? */
163  if (0 != strncmp(nist, "NIST_1A", 7)) {
164  fclose(fh);
165  return FALSE;
166  }
167  /* Rewind, parse lines. */
168  fseek(fh, 0, SEEK_SET);
169  for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
170  char **words;
171  int nword;
172 
173  string_trim(li->buf, STRING_BOTH);
174  if (strlen(li->buf) == 0) {
175  lineiter_free(li);
176  break;
177  }
178  nword = str2words(li->buf, NULL, 0);
179  if (nword != 3)
180  continue;
181  words = (char **)ckd_calloc(nword, sizeof(*words));
182  str2words(li->buf, words, nword);
183  if (0 == strcmp(words[0], "sample_rate")) {
184  float samprate = atof_c(words[2]);
185  if (cmd_ln_float32_r(wtf->config, "-samprate") != samprate) {
186  E_ERROR("Sample rate %.1f does not match configured value in file '%s'\n", samprate, infile);
187  lineiter_free(li);
188  fclose(fh);
189  return -1;
190  }
191  }
192  if (0 == strcmp(words[0], "channel_count")) {
193  int nchans = atoi(words[2]);
194  if (cmd_ln_int32_r(wtf->config, "-nchans") != nchans) {
195  E_ERROR("Number of channels %d does not match configured value in file '%s'\n", nchans, infile);
196  lineiter_free(li);
197  fclose(fh);
198  return -1;
199  }
200  }
201  if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
202  const char *endian = (0 == strcmp(words[2], "10")) ? "big" : "little";
203  if (0 != strcmp(cmd_ln_str_r(wtf->config, "-input_endian"), endian)) {
204  E_ERROR("Input endian %s does not match configured value in file '%s'\n", endian, infile);
205  lineiter_free(li);
206  fclose(fh);
207  return -1;
208  }
209  }
210  ckd_free(words);
211  }
212 
213  fseek(fh, 1024, SEEK_SET);
214  if (out_fh)
215  *out_fh = fh;
216  else
217  fclose(fh);
218  return TRUE;
219 }
220 
221 #ifdef HAVE_POPEN
222 static int
223 detect_sph2pipe(sphinx_wave2feat_t *wtf)
224 {
225  FILE *fh;
226  char *cmdline;
227  int rv;
228 
229  /* Determine if it's NIST file and get parameters. */
230  if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
231  return rv;
232 
233  /* Now popen it with sph2pipe. */
234  cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
235  if ((fh = popen(cmdline, "r")) == NULL) {
236  E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
237  ckd_free(cmdline);
238  return -1;
239  }
240 
241  wtf->infh = fh;
242  return TRUE;
243 }
244 #else /* !HAVE_POPEN */
245 static int
246 detect_sph2pipe(sphinx_wave2feat_t *wtf)
247 {
248  E_ERROR("popen() not available, cannot run sph2pipe\n");
249  return -1;
250 }
251 #endif /* !HAVE_POPEN */
252 
258 static int
259 detect_nist(sphinx_wave2feat_t *wtf)
260 {
261  FILE *fh;
262  int rv;
263 
264  if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
265  return rv;
266  wtf->infh = fh;
267 
268  return TRUE;
269 }
270 
271 
278 static int
279 detect_raw(sphinx_wave2feat_t *wtf)
280 {
281  FILE *fh;
282 
283  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
284  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
285  return -1;
286  }
287  wtf->infh = fh;
288  return TRUE;
289 }
290 
297 static int
298 detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
299 {
300  FILE *fh;
301  int32 len;
302  long flen;
303 
304  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
305  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
306  return -1;
307  }
308  if (fread(&len, 4, 1, fh) != 1) {
309  E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
310  fclose(fh);
311  return -1;
312  }
313  fseek(fh, 0, SEEK_END);
314  flen = ftell(fh);
315 
316  /* figure out whether to byteswap */
317  flen = (flen / 4) - 1;
318  if (flen != len) {
319  /* First make sure this is an endianness problem, otherwise fail. */
320  SWAP_INT32(&len);
321  if (flen != len) {
322  SWAP_INT32(&len);
323  E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
324  len, flen);
325  return -1;
326  }
327  /* Set the input endianness to the opposite of the machine endianness... */
328  cmd_ln_set_str_r(wtf->config, "-input_endian",
329  (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
330  ? "little" : "big"));
331  }
332 
333  fseek(fh, 4, SEEK_SET);
334  wtf->infh = fh;
335  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
336  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
337  }
338  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
339  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
340  wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
341  }
342  else {
343  /* Should not happen. */
344  E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
345  assert(FALSE);
346  }
347 
348  return TRUE;
349 }
350 
351 int
352 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
353 {
354  int i, j;
355 
356  if (whichchan > 0) {
357  for (i = whichchan - 1; i < nsamp; i += nchans)
358  buf[i/nchans] = buf[i];
359  }
360  else {
361  for (i = 0; i < nsamp; i += nchans) {
362  float64 tmp = 0.0;
363  for (j = 0; j < nchans && i + j < nsamp; ++j) {
364  tmp += buf[i + j];
365  }
366  buf[i/nchans] = (int16)(tmp / nchans);
367  }
368  }
369  return i/nchans;
370 }
371 
376 static int
377 decode_pcm(sphinx_wave2feat_t *wtf)
378 {
379  size_t nsamp;
380  int32 n, nfr, nchans, whichchan;
381  uint32 nfloat;
382 
383  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
384  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
385  fe_start_stream(wtf->fe);
386  fe_start_utt(wtf->fe);
387  nfloat = 0;
388  while ((nsamp = fread(wtf->audio, sizeof(int16), wtf->blocksize, wtf->infh)) != 0) {
389  size_t nvec;
390  int16 const *inspeech;
391 
392  /* Byteswap stuff here if necessary. */
393  if (wtf->byteswap) {
394  for (n = 0; n < nsamp; ++n)
395  SWAP_INT16(wtf->audio + n);
396  }
397 
398  /* Mix or pick channels. */
399  if (nchans > 1)
400  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
401 
402  inspeech = wtf->audio;
403  nvec = wtf->featsize;
404  /* Consume all samples. */
405  while (nsamp) {
406  nfr = nvec;
407  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr, NULL);
408  if (nfr) {
409  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
410  return -1;
411  nfloat += n;
412  }
413  }
414  inspeech = wtf->audio;
415  }
416  /* Now process any leftover audio frames. */
417  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
418  if (nfr) {
419  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
420  return -1;
421  nfloat += n;
422  }
423 
424  if (fclose(wtf->infh) == EOF)
425  E_ERROR_SYSTEM("Failed to close input file");
426  wtf->infh = NULL;
427  return nfloat;
428 }
429 
434 static int
435 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
436 {
437  int nfloat = 0, n;
438  int featsize = wtf->featsize;
439 
440  /* If the input vector length is less than the output length, we
441  * need to do this one frame at a time, because there's empty
442  * space at the end of each vector in wtf->feat. */
443  if (wtf->in_veclen < wtf->veclen)
444  featsize = 1;
445  while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
446  featsize * wtf->in_veclen, wtf->infh)) != 0) {
447  int i, nfr = n / wtf->in_veclen;
448  if (n % wtf->in_veclen) {
449  E_ERROR("Size of file %d not a multiple of veclen %d\n",
450  n, wtf->in_veclen);
451  return -1;
452  }
453  /* Byteswap stuff here if necessary. */
454  if (wtf->byteswap) {
455  for (i = 0; i < n; ++i)
456  SWAP_FLOAT32(wtf->feat[0] + i);
457  }
458  fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
459  for (i = 0; i < nfr; ++i) {
460  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
461  if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
462  fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
463  else
464  fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
465  }
466  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
467  fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
468  }
469  }
470  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
471  return -1;
472  nfloat += n;
473  }
474 
475  if (fclose(wtf->infh) == EOF)
476  E_ERROR_SYSTEM("Failed to close input file");
477  wtf->infh = NULL;
478  return nfloat;
479 }
480 
481 static const audio_type_t types[] = {
482  { "-mswav", &detect_riff, &decode_pcm },
483  { "-nist", &detect_nist, &decode_pcm },
484  { "-raw", &detect_raw, &decode_pcm },
485  { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
486 };
487 static const int ntypes = sizeof(types)/sizeof(types[0]);
488 static const audio_type_t mfcc_type = {
489  "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
490 };
491 
497 static int
498 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
499 {
500  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
501  E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
502  return -1;
503  }
504  return 0;
505 }
506 
512 static int
513 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
514 {
515  int i, nfloat = 0;
516 
517  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
518  for (i = 0; i < nfr; ++i) {
519  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
520  E_ERROR_SYSTEM("Writing %d values to %s failed",
521  wtf->veclen, wtf->outfile);
522  return -1;
523  }
524  nfloat += wtf->veclen;
525  }
526  return nfloat;
527 }
528 
/**
 * HTK base parameter kinds, as stored in the parameter-kind field of an
 * HTK file header.
 */
typedef enum htk_feature_kind_e {
    /* PCM audio (rarely used) */
    WAVEFORM = 0,
    /* LPC filter coefficients */
    LPC = 1,
    /* LPC reflection coefficients */
    LPCREFC = 2,
    /* LPC-based cepstral coefficients */
    LPCEPSTRA = 3,
    /* LPCC plus deltas */
    LPCDELCEP = 4,
    /* 16-bit integer LPC reflection coefficients */
    IREFC = 5,
    /* MFCCs */
    MFCC = 6,
    /* Log mel spectrum */
    FBANK = 7,
    /* Linear mel spectrum */
    MELSPEC = 8,
    /* User defined */
    USER = 9,
    /* Vector quantized data */
    DISCRETE = 10,
    /* PLP coefficients */
    PLP = 11
} htk_feature_kind_t;
543 
/**
 * HTK parameter-kind qualifier bits, OR-ed onto a base parameter kind.
 * The values are the same as the conventional octal ones (given in the
 * comments), written here in hexadecimal.
 */
typedef enum htk_feature_flag_e {
    _E = 0x0040, /* 0000100: has energy */
    _N = 0x0080, /* 0000200: absolute energy suppressed */
    _D = 0x0100, /* 0000400: has delta coefficients */
    _A = 0x0200, /* 0001000: has acceleration (delta-delta) coefficients */
    _C = 0x0400, /* 0002000: is compressed */
    _Z = 0x0800, /* 0004000: has zero mean static coefficients (i.e. CMN) */
    _K = 0x1000, /* 0010000: has CRC checksum */
    _O = 0x2000, /* 0020000: has 0th cepstral coefficient */
    _V = 0x4000, /* 0040000: has VQ data */
    _T = 0x8000  /* 0100000: has third differential coefficients */
} htk_feature_flag_t;
556 
560 static int
561 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
562 {
563  int32 samp_period;
564  int16 samp_size;
565  int16 param_kind;
566  int swap = FALSE;
567 
568  /* HTK files are big-endian. */
569  if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
570  swap = TRUE;
571  /* Same file size thing as in Sphinx files (I think) */
572  if (swap) SWAP_INT32(&nfloat);
573  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
574  return -1;
575  /* Sample period in 100ns units. */
576  samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
577  if (swap) SWAP_INT32(&samp_period);
578  if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
579  return -1;
580  /* Sample size - veclen * sizeof each sample. */
581  samp_size = wtf->veclen * 4;
582  if (swap) SWAP_INT16(&samp_size);
583  if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
584  return -1;
585  /* Format and flags. */
586  if (cmd_ln_boolean_r(wtf->config, "-logspec")
587  || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
588  param_kind = FBANK; /* log mel-filter bank outputs */
589  else
590  param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
591  if (swap) SWAP_INT16(&param_kind);
592  if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
593  return -1;
594 
595  return 0;
596 }
597 
601 static int
602 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
603 {
604  int i, j, swap, htk_reorder, nfloat = 0;
605 
606  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
607  /* This is possibly inefficient, but probably not a big deal. */
608  swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
609  htk_reorder = (0 == strcmp("htk", wtf->ot->name)
610  && !(cmd_ln_boolean_r(wtf->config, "-logspec")
611  || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
612  for (i = 0; i < nfr; ++i) {
613  if (htk_reorder) {
614  mfcc_t c0 = frames[i][0];
615  memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
616  frames[i][wtf->veclen - 1] = c0;
617  }
618  if (swap)
619  for (j = 0; j < wtf->veclen; ++j)
620  SWAP_FLOAT32(frames[i] + j);
621  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
622  E_ERROR_SYSTEM("Writing %d values to %s failed",
623  wtf->veclen, wtf->outfile);
624  return -1;
625  }
626  nfloat += wtf->veclen;
627  }
628  return nfloat;
629 }
630 
634 static int
635 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
636 {
637  int i, j, nfloat = 0;
638 
639  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
640  for (i = 0; i < nfr; ++i) {
641  for (j = 0; j < wtf->veclen; ++j) {
642  fprintf(wtf->outfh, "%.5g", MFCC2FLOAT(frames[i][j]));
643  if (j == wtf->veclen - 1)
644  fprintf(wtf->outfh, "\n");
645  else
646  fprintf(wtf->outfh, " ");
647  }
648  nfloat += wtf->veclen;
649  }
650  return nfloat;
651 }
652 
653 static const output_type_t outtypes[] = {
654  { "sphinx", &output_header_sphinx, &output_frames_sphinx },
655  { "htk", &output_header_htk, &output_frames_htk },
656  { "text", NULL, &output_frames_text }
657 };
658 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
659 
661 sphinx_wave2feat_init(cmd_ln_t *config)
662 {
663  sphinx_wave2feat_t *wtf;
664  int i;
665 
666  wtf = (sphinx_wave2feat_t *)ckd_calloc(1, sizeof(*wtf));
667  wtf->refcount = 1;
668  wtf->config = cmd_ln_retain(config);
669  wtf->fe = fe_init_auto_r(wtf->config);
670  if (!wtf->fe) {
671  E_FATAL("Failed to create feature extraction\n");
672  }
673 
674  wtf->ot = outtypes; /* Default (sphinx) type. */
675  for (i = 0; i < nouttypes; ++i) {
676  output_type_t const *otype = &outtypes[i];
677  if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
678  wtf->ot = otype;
679  break;
680  }
681  }
682  if (i == nouttypes) {
683  E_ERROR("Unknown output type: '%s'\n",
684  cmd_ln_str_r(config, "-ofmt"));
685  sphinx_wave2feat_free(wtf);
686  return NULL;
687  }
688 
689  return wtf;
690 }
691 
692 int
693 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
694 {
695  if (wtf == NULL)
696  return 0;
697  if (--wtf->refcount > 0)
698  return wtf->refcount;
699 
700  if (wtf->audio)
701  ckd_free(wtf->audio);
702  if (wtf->feat)
703  ckd_free_2d(wtf->feat);
704  if (wtf->infile)
705  ckd_free(wtf->infile);
706  if (wtf->outfile)
707  ckd_free(wtf->outfile);
708  if (wtf->infh) {
709  if (fclose(wtf->infh) == EOF)
710  E_ERROR_SYSTEM("Failed to close input file");
711  }
712  if (wtf->outfh) {
713  if (fclose(wtf->outfh) == EOF)
714  E_ERROR_SYSTEM("Failed to close output file");
715  }
716  cmd_ln_free_r(wtf->config);
717  fe_free(wtf->fe);
718  ckd_free(wtf);
719 
720  return 0;
721 }
722 
724 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
725 {
726  ++wtf->refcount;
727  return wtf;
728 }
729 
730 static audio_type_t const *
731 detect_audio_type(sphinx_wave2feat_t *wtf)
732 {
733  audio_type_t const *atype = NULL;
734  int i;
735 
736  /* Special case audio type for Sphinx MFCC inputs. */
737  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
738  || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
739  int rv = mfcc_type.detect(wtf);
740  if (rv == -1)
741  goto error_out;
742  return &mfcc_type;
743  }
744 
745  /* Try to use the type of infile given on the command line. */
746  for (i = 0; i < ntypes; ++i) {
747  int rv;
748  atype = &types[i];
749  if (cmd_ln_boolean_r(wtf->config, atype->name)) {
750  rv = (*atype->detect)(wtf);
751  if (rv == -1)
752  goto error_out;
753  else if (rv == TRUE)
754  break;
755  }
756  }
757  if (i == ntypes) {
758  /* Detect file type of infile and get parameters. */
759  for (i = 0; i < ntypes; ++i) {
760  int rv;
761  atype = &types[i];
762  rv = (*atype->detect)(wtf);
763  if (rv == -1)
764  goto error_out;
765  else if (rv == TRUE)
766  break;
767  }
768  if (i == ntypes)
769  goto error_out;
770  }
771  return atype;
772  error_out:
773  if (wtf->infh)
774  fclose(wtf->infh);
775  wtf->infh = NULL;
776  return NULL;
777 }
778 
779 int
780 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
781  char const *infile, char const *outfile)
782 {
783  int nchans, nfloat, veclen;
784  audio_type_t const *atype = NULL;
785  int fshift, fsize;
786 
787  E_INFO("Converting %s to %s\n", infile, outfile);
788 
789  wtf->infile = ckd_salloc(infile);
790 
791  /* Detect input file type. */
792  if ((atype = detect_audio_type(wtf)) == NULL)
793  return -1;
794 
795  /* Determine whether to byteswap input. */
796  wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
797  cmd_ln_str_r(wtf->config, "-input_endian"));
798 
799  /* Get the output frame size (if not already set). */
800  if (wtf->veclen == 0)
801  wtf->veclen = fe_get_output_size(wtf->fe);
802 
803  /* Set up the input and output buffers. */
804  fe_get_input_size(wtf->fe, &fshift, &fsize);
805  /* Want to get at least a whole frame plus shift in here. Also we
806  will either pick or mix multiple channels so we need to read
807  them all at once. */
808  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
809  wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
810  if (wtf->blocksize < (fsize + fshift) * nchans) {
811  E_INFO("Block size of %d too small, increasing to %d\n",
812  wtf->blocksize,
813  (fsize + fshift) * nchans);
814  wtf->blocksize = (fsize + fshift) * nchans;
815  }
816  wtf->audio = (short *)ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
817  wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
818 
819  /* Use the maximum of the input and output frame sizes to allocate this. */
820  veclen = wtf->veclen;
821  if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
822 
823  wtf->feat = (mfcc_t**)ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
824 
825  /* Let's go! */
826  if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
827  E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
828  return -1;
829  }
830  /* Write an empty header, which we'll fill in later. */
831  if (wtf->ot->output_header &&
832  (*wtf->ot->output_header)(wtf, 0) < 0) {
833  E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
834  goto error_out;
835  }
836  wtf->outfile = ckd_salloc(outfile);
837 
838  if ((nfloat = (*atype->decode)(wtf)) < 0) {
839  E_ERROR("Failed to convert");
840  goto error_out;
841  }
842 
843  if (wtf->ot->output_header) {
844  if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
845  E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
846  goto error_out;
847  }
848  if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
849  E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
850  goto error_out;
851  }
852  }
853 
854 
855  if (wtf->audio)
856  ckd_free(wtf->audio);
857  if (wtf->feat)
858  ckd_free_2d(wtf->feat);
859  if (wtf->infile)
860  ckd_free(wtf->infile);
861  if (wtf->outfile)
862  ckd_free(wtf->outfile);
863 
864  wtf->audio = NULL;
865  wtf->infile = NULL;
866  wtf->feat = NULL;
867  wtf->outfile = NULL;
868 
869  if (wtf->outfh)
870  if (fclose(wtf->outfh) == EOF)
871  E_ERROR_SYSTEM("Failed to close output file");
872  wtf->outfh = NULL;
873 
874  return 0;
875 
876 error_out:
877 
878  if (wtf->audio)
879  ckd_free(wtf->audio);
880  if (wtf->feat)
881  ckd_free_2d(wtf->feat);
882  if (wtf->infile)
883  ckd_free(wtf->infile);
884  if (wtf->outfile)
885  ckd_free(wtf->outfile);
886 
887  wtf->audio = NULL;
888  wtf->infile = NULL;
889  wtf->feat = NULL;
890  wtf->outfile = NULL;
891 
892  if (wtf->outfh)
893  if (fclose(wtf->outfh) == EOF)
894  E_ERROR_SYSTEM("Failed to close output file");
895  wtf->outfh = NULL;
896 
897  return -1;
898 }
899 
900 void
901 build_filenames(cmd_ln_t *config, char const *basename,
902  char **out_infile, char **out_outfile)
903 {
904  char const *di, *do_, *ei, *eo;
905 
906  di = cmd_ln_str_r(config, "-di");
907  do_ = cmd_ln_str_r(config, "-do");
908  ei = cmd_ln_str_r(config, "-ei");
909  eo = cmd_ln_str_r(config, "-eo");
910 
911  *out_infile = string_join(di ? di : "",
912  di ? "/" : "",
913  basename,
914  ei ? "." : "",
915  ei ? ei : "",
916  NULL);
917  *out_outfile = string_join(do_ ? do_ : "",
918  do_ ? "/" : "",
919  basename,
920  eo ? "." : "",
921  eo ? eo : "",
922  NULL);
923  /* Build output directory structure if possible/requested (it is
924  * by default). */
925  if (cmd_ln_boolean_r(config, "-build_outdirs")) {
926  char *dirname = ckd_salloc(*out_outfile);
927  path2dirname(*out_outfile, dirname);
928  build_directory(dirname);
929  ckd_free(dirname);
930  }
931 }
932 
933 static int
934 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
935 {
936  hash_table_t *files;
937  hash_iter_t *itor;
938  lineiter_t *li;
939  FILE *ctlfh;
940  int nskip, runlen, npart;
941 
942  if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
943  E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
944  return -1;
945  }
946  nskip = cmd_ln_int32_r(wtf->config, "-nskip");
947  runlen = cmd_ln_int32_r(wtf->config, "-runlen");
948  if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
949  /* Count lines in the file. */
950  int partlen, part, nlines = 0;
951  part = cmd_ln_int32_r(wtf->config, "-part");
952  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
953  ++nlines;
954  fseek(ctlfh, 0, SEEK_SET);
955  partlen = nlines / npart;
956  nskip = partlen * (part - 1);
957  if (part == npart)
958  runlen = -1;
959  else
960  runlen = partlen;
961  }
962  if (runlen != -1){
963  E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
964  files = hash_table_new(runlen, HASH_CASE_YES);
965  }
966  else {
967  E_INFO("Processing all remaining utterances at position %d\n", nskip);
968  files = hash_table_new(1000, HASH_CASE_YES);
969  }
970  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
971  char *c, *infile, *outfile;
972 
973  if (nskip-- > 0)
974  continue;
975  if (runlen == 0) {
976  lineiter_free(li);
977  break;
978  }
979  --runlen;
980 
981  string_trim(li->buf, STRING_BOTH);
982  /* Extract the file ID from the control line. */
983  if ((c = strchr(li->buf, ' ')) != NULL)
984  *c = '\0';
985  if (strlen(li->buf) == 0) {
986  E_WARN("Empty line %d in control file, skipping\n", li->lineno);
987  continue;
988  }
989  build_filenames(wtf->config, li->buf, &infile, &outfile);
990  if (hash_table_lookup(files, infile, NULL) == 0)
991  continue;
992  sphinx_wave2feat_convert_file(wtf, infile, outfile);
993  hash_table_enter(files, infile, outfile);
994  }
995  for (itor = hash_table_iter(files); itor;
996  itor = hash_table_iter_next(itor)) {
997  ckd_free((void *)hash_entry_key(itor->ent));
998  ckd_free(hash_entry_val(itor->ent));
999  }
1000  hash_table_free(files);
1001  fclose(ctlfh);
1002 
1003  return 0;
1004 }
1005 
1006 int
1007 main(int argc, char *argv[])
1008 {
1009  sphinx_wave2feat_t *wtf;
1010  cmd_ln_t *config;
1011  int rv;
1012 
1013  config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE);
1014 
1015  if (config && cmd_ln_str_r(config, "-argfile"))
1016  config = cmd_ln_parse_file_r(config, defn,
1017  cmd_ln_str_r(config, "-argfile"), FALSE);
1018  if (config == NULL) {
1019  E_ERROR("Command line parsing failed\n");
1020  return 1;
1021  }
1022 
1023  if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1024  E_ERROR("Failed to initialize wave2feat object\n");
1025  return 1;
1026  }
1027 
1028  /* If there's a control file run through it, otherwise we will do
1029  * a single file (which is what run_control_file will do
1030  * internally too) */
1031  if (cmd_ln_str_r(config, "-c"))
1032  rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1033  else
1034  rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1035  cmd_ln_str_r(config, "-o"));
1036 
1037  sphinx_wave2feat_free(wtf);
1038  cmd_ln_free_r(config);
1039  return rv;
1040 }
hash_iter_s::ent
hash_entry_t * ent
Current entry in that table.
Definition: hash_table.h:170
hash_iter_s
Definition: hash_table.h:168
string_trim
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:97
output_type_s
Definition: sphinx_fe.c:66
lineiter_start
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:264
sphinx_wave2feat_s
Definition: sphinx_fe.c:72
cmd_ln_t
E_INFO
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
sphinx_wave2feat_s::byteswap
int byteswap
Whether byteswapping is necessary.
Definition: sphinx_fe.c:86
STRING_BOTH
@ STRING_BOTH
Both ends of string.
Definition: strfuncs.h:73
cmd_ln_str_r
SPHINXBASE_EXPORT const char * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
hash_table_lookup
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition: hash_table.c:302
hash_table_new
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition: hash_table.c:158
cmd_ln_parse_r
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:556
sphinx_wave2feat_s::veclen
int veclen
Length of each output vector.
Definition: sphinx_fe.c:84
sphinx_wave2feat_s::audio
short * audio
Audio buffer.
Definition: sphinx_fe.c:80
strfuncs.h
Miscellaneous useful string functions.
string_join
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition: strfuncs.c:70
cmd_ln_free_r
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1046
E_ERROR_SYSTEM
#define E_ERROR_SYSTEM(...)
Print error text, then call perror("").
Definition: err.h:99
filename.h
File names related operation.
hash_table_free
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition: hash_table.c:688
ckd_calloc_2d
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
ckd_free
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
hash_table_iter_next
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition: hash_table.c:656
sphinx_wave2feat_s::ot
const output_type_t * ot
Output type object.
Definition: sphinx_fe.c:87
ckd_salloc
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
str2words
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
sphinx_wave2feat_s::infh
FILE * infh
Input file handle.
Definition: sphinx_fe.c:78
err.h
Implementation of logging routines.
cmd_ln.h
Command-line and other configuration parsing and handling.
hash_table_enter
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:501
hash_table_iter
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition: hash_table.c:646
path2dirname
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir. Caller must have allocate...
Definition: filename.c:68
hash_entry_val
#define hash_entry_val(e)
Access macros.
Definition: hash_table.h:175
ckd_free_2d
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
sphinx_wave2feat_s::feat
mfcc_t ** feat
Feature buffer.
Definition: sphinx_fe.c:81
audio_type_s
Definition: sphinx_fe.c:60
E_ERROR
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
hash_table_s
Definition: hash_table.h:159
cmd_ln_parse_file_r
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by delimiting on " \r\t\n" and putting each token into an argv[] for cmd_l...
Definition: cmd_ln.c:764
RIFFHeader
RIFF 44-byte header structure for MS wav files.
Definition: sphinx_fe.c:91
sphinx_wave2feat_s::config
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:74
lineiter_next
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
pio.h
File I/O related operations.
lineiter_free
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:368
sphinx_wave2feat_s::infile
char * infile
Path to input file.
Definition: sphinx_fe.c:76
sphinx_wave2feat_s::fe
fe_t * fe
Front end object.
Definition: sphinx_fe.c:75
build_directory
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition: pio.c:621
sphinx_wave2feat_s::outfh
FILE * outfh
Output file handle.
Definition: sphinx_fe.c:79
cmd_ln_boolean_r
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
sphinx_wave2feat_s::featsize
int featsize
Size of feature buffer.
Definition: sphinx_fe.c:83
ckd_alloc.h
Sphinx's memory allocation/deallocation routines.
sphinx_wave2feat_s::blocksize
int blocksize
Size of audio buffer.
Definition: sphinx_fe.c:82
fe_s
Structure for the front-end computation.
Definition: fe_internal.h:117
ckd_calloc
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
cmd_ln_set_str_r
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition: cmd_ln.c:989
E_FATAL
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
lineiter_t
Line iterator for files.
Definition: pio.h:177
sphinx_wave2feat_s::outfile
char * outfile
Path to output file.
Definition: sphinx_fe.c:77
hash_table.h
Hash table implementation.
sphinx_wave2feat_s::refcount
int refcount
Reference count.
Definition: sphinx_fe.c:73
atof_c
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
E_WARN
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
sphinx_wave2feat_s::in_veclen
int in_veclen
Length of each input vector (for cep<->spec).
Definition: sphinx_fe.c:85
cmd_ln_retain
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition: cmd_ln.c:1039