SphinxBase 0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 2008 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 00038 #include <stdio.h> 00039 #include <string.h> 00040 00041 #include <sphinxbase/cmd_ln.h> 00042 #include <sphinxbase/yin.h> 00043 #include <sphinxbase/ckd_alloc.h> 00044 #include <sphinxbase/byteorder.h> 00045 #include <sphinxbase/strfuncs.h> 00046 #include <sphinxbase/err.h> 00047 #include <sphinxbase/pio.h> 00048 00049 #ifndef WORDS_BIGENDIAN 00050 #define WORDS_BIGENDIAN 0 00051 #endif 00052 00053 static arg_t defn[] = { 00054 { "-i", 00055 ARG_STRING, 00056 NULL, 00057 "Single audio input file" }, 00058 00059 { "-o", 00060 ARG_STRING, 00061 NULL, 00062 "Single text output file (standard output will be used if not given)" }, 00063 00064 { "-c", 00065 ARG_STRING, 00066 NULL, 00067 "Control file for batch processing" }, 00068 00069 { "-nskip", 00070 ARG_INT32, 00071 "0", 00072 "If a control file was specified, the number of utterances to skip at the head of the file" }, 00073 00074 { "-runlen", 00075 ARG_INT32, 00076 "-1", 00077 "If a control file was specified, the number of utterances to process (see -nskip too)" }, 00078 00079 { "-di", 00080 ARG_STRING, 00081 NULL, 00082 "Input directory, input file names are relative to this, if defined" }, 00083 00084 { "-ei", 00085 ARG_STRING, 00086 NULL, 00087 "Input extension to be applied to all input files" }, 00088 00089 { "-do", 00090 ARG_STRING, 00091 NULL, 00092 "Output directory, output files are relative to this" }, 00093 00094 { "-eo", 00095 ARG_STRING, 00096 NULL, 00097 "Output extension to be applied to all output files" }, 00098 00099 { "-nist", 00100 ARG_BOOLEAN, 00101 "no", 00102 "Defines input format as NIST sphere" }, 00103 00104 { "-raw", 00105 ARG_BOOLEAN, 00106 "no", 00107 "Defines input format as raw binary data" }, 00108 00109 { "-mswav", 00110 ARG_BOOLEAN, 00111 "no", 00112 "Defines input format as Microsoft Wav (RIFF)" }, 00113 00114 { "-samprate", 00115 ARG_INT32, 00116 "0", 00117 "Sampling rate of audio data (will be determined automatically if 0)" }, 00118 00119 { "-input_endian", 00120 ARG_STRING, 00121 NULL, 00122 "Endianness of audio data (will be determined automatically if not given)" }, 00123 00124 { "-fshift", 00125 ARG_FLOAT32, 00126 "0.01", 00127 "Frame shift: number of seconds between each analysis frame." }, 00128 00129 { "-flen", 00130 ARG_FLOAT32, 00131 "0.025", 00132 "Number of seconds in each analysis frame (needs to be greater than twice the longest period you wish to detect - to detect down to 80Hz you need a frame length of 2.0/80 = 0.025)." }, 00133 00134 { "-smooth_window", 00135 ARG_INT32, 00136 "2", 00137 "Number of frames on either side of the current frame to use for smoothing." }, 00138 00139 { "-voice_thresh", 00140 ARG_FLOAT32, 00141 "0.1", 00142 "Threshold of normalized difference under which to search for the fundamental period." }, 00143 00144 { "-search_range", 00145 ARG_FLOAT32, 00146 "0.2", 00147 "Fraction of the best local estimate to use as a search range for smoothing." }, 00148 00149 { NULL, 0, NULL, NULL } 00150 }; 00151 00152 static int extract_pitch(const char *in, const char *out); 00153 static int run_control_file(const char *ctl); 00154 00155 int 00156 main(int argc, char *argv[]) 00157 { 00158 cmd_ln_parse(defn, argc, argv, TRUE); 00159 00160 /* Run a control file if requested. */ 00161 if (cmd_ln_str("-c")) { 00162 if (run_control_file(cmd_ln_str("-c")) < 0) 00163 return 1; 00164 } 00165 else { 00166 if (extract_pitch(cmd_ln_str("-i"), cmd_ln_str("-o")) < 0) 00167 return 1; 00168 } 00169 00170 cmd_ln_free(); 00171 return 0; 00172 } 00173 00174 static int 00175 guess_file_type(char const *file, FILE *infh) 00176 { 00177 char header[4]; 00178 00179 fseek(infh, 0, SEEK_SET); 00180 if (fread(header, 1, 4, infh) != 4) { 00181 E_ERROR_SYSTEM("Failed to read 4 byte header"); 00182 return -1; 00183 } 00184 if (0 == memcmp(header, "RIFF", 4)) { 00185 E_INFO("%s appears to be a WAV file\n", file); 00186 cmd_ln_set_boolean("-mswav", TRUE); 00187 cmd_ln_set_boolean("-nist", FALSE); 00188 cmd_ln_set_boolean("-raw", FALSE); 00189 } 00190 else if (0 == memcmp(header, "NIST", 4)) { 00191 E_INFO("%s appears to be a NIST SPHERE file\n", file); 00192 cmd_ln_set_boolean("-mswav", FALSE); 00193 cmd_ln_set_boolean("-nist", TRUE); 00194 cmd_ln_set_boolean("-raw", FALSE); 00195 } 00196 else { 00197 E_INFO("%s appears to be raw data\n", file); 00198 cmd_ln_set_boolean("-mswav", FALSE); 00199 cmd_ln_set_boolean("-nist", FALSE); 00200 cmd_ln_set_boolean("-raw", TRUE); 00201 } 00202 fseek(infh, 0, SEEK_SET); 00203 return 0; 00204 } 00205 00206 #define TRY_FREAD(ptr, size, nmemb, stream) \ 00207 if (fread(ptr, size, nmemb, stream) != (nmemb)) { \ 00208 E_ERROR_SYSTEM("Failed to read %d bytes", size * nmemb); \ 00209 goto error_out; \ 00210 } 00211 00212 static int 00213 read_riff_header(FILE *infh) 00214 { 00215 char id[4]; 00216 int32 intval, header_len; 00217 int16 shortval; 00218 00219 /* RIFF files are little-endian by definition. */ 00220 cmd_ln_set_str("-input_endian", "little"); 00221 00222 /* Read in all the header chunks and etcetera. */ 00223 TRY_FREAD(id, 1, 4, infh); 00224 /* Total file length (we don't care) */ 00225 TRY_FREAD(&intval, 4, 1, infh); 00226 /* 'WAVE' */ 00227 TRY_FREAD(id, 1, 4, infh); 00228 if (0 != memcmp(id, "WAVE", 4)) { 00229 E_ERROR("This is not a WAVE file\n"); 00230 goto error_out; 00231 } 00232 /* 'fmt ' */ 00233 TRY_FREAD(id, 1, 4, infh); 00234 if (0 != memcmp(id, "fmt ", 4)) { 00235 E_ERROR("Format chunk missing\n"); 00236 goto error_out; 00237 } 00238 /* Length of 'fmt ' chunk */ 00239 TRY_FREAD(&intval, 4, 1, infh); 00240 if (WORDS_BIGENDIAN) SWAP_INT32(&intval); 00241 header_len = intval; 00242 00243 /* Data format. */ 00244 TRY_FREAD(&shortval, 2, 1, infh); 00245 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval); 00246 if (shortval != 1) { /* PCM */ 00247 E_ERROR("WAVE file is not in PCM format\n"); 00248 goto error_out; 00249 } 00250 00251 /* Number of channels. */ 00252 TRY_FREAD(&shortval, 2, 1, infh); 00253 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval); 00254 if (shortval != 1) { /* PCM */ 00255 E_ERROR("WAVE file is not single channel\n"); 00256 goto error_out; 00257 } 00258 00259 /* Sampling rate (finally!) */ 00260 TRY_FREAD(&intval, 4, 1, infh); 00261 if (WORDS_BIGENDIAN) SWAP_INT32(&intval); 00262 if (cmd_ln_int32("-samprate") == 0) 00263 cmd_ln_set_int32("-samprate", intval); 00264 else if (cmd_ln_int32("-samprate") != intval) { 00265 E_WARN("WAVE file sampling rate %d != -samprate %d\n", 00266 intval, cmd_ln_int32("-samprate")); 00267 } 00268 00269 /* Average bytes per second (we don't care) */ 00270 TRY_FREAD(&intval, 4, 1, infh); 00271 00272 /* Block alignment (we don't care) */ 00273 TRY_FREAD(&shortval, 2, 1, infh); 00274 00275 /* Bits per sample (must be 16) */ 00276 TRY_FREAD(&shortval, 2, 1, infh); 00277 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval); 00278 if (shortval != 16) { 00279 E_ERROR("WAVE file is not 16-bit\n"); 00280 goto error_out; 00281 } 00282 00283 /* Any extra parameters. */ 00284 if (header_len > 16) 00285 fseek(infh, header_len - 16, SEEK_CUR); 00286 00287 /* Now skip to the 'data' chunk. */ 00288 while (1) { 00289 TRY_FREAD(id, 1, 4, infh); 00290 if (0 == memcmp(id, "data", 4)) { 00291 /* Total number of bytes of data (we don't care). */ 00292 TRY_FREAD(&intval, 4, 1, infh); 00293 break; 00294 } 00295 else { 00296 /* Some other stuff... */ 00297 /* Number of bytes of ... whatever */ 00298 TRY_FREAD(&intval, 4, 1, infh); 00299 if (WORDS_BIGENDIAN) SWAP_INT32(&intval); 00300 fseek(infh, intval, SEEK_CUR); 00301 } 00302 } 00303 00304 /* We are ready to rumble. */ 00305 return 0; 00306 error_out: 00307 return -1; 00308 } 00309 00310 static int 00311 read_nist_header(FILE *infh) 00312 { 00313 char hdr[1024]; 00314 char *line, *c; 00315 00316 TRY_FREAD(hdr, 1, 1024, infh); 00317 hdr[1023] = '\0'; 00318 00319 /* Roughly parse it to find the sampling rate and byte order 00320 * (don't bother with other stuff) */ 00321 if ((line = strstr(hdr, "sample_rate")) == NULL) { 00322 E_ERROR("No sampling rate in NIST header!\n"); 00323 goto error_out; 00324 } 00325 c = strchr(line, '\n'); 00326 if (c) *c = '\0'; 00327 c = strrchr(line, ' '); 00328 if (c == NULL) { 00329 E_ERROR("Could not find sampling rate!\n"); 00330 goto error_out; 00331 } 00332 ++c; 00333 if (cmd_ln_int32("-samprate") == 0) 00334 cmd_ln_set_int32("-samprate", atoi(c)); 00335 else if (cmd_ln_int32("-samprate") != atoi(c)) { 00336 E_WARN("NIST file sampling rate %d != -samprate %d\n", 00337 atoi(c), cmd_ln_int32("-samprate")); 00338 } 00339 00340 if (line + strlen(line) < hdr + 1023) 00341 line[strlen(line)] = ' '; 00342 if ((line = strstr(hdr, "sample_byte_format")) == NULL) { 00343 E_ERROR("No sample byte format in NIST header!\n"); 00344 goto error_out; 00345 } 00346 c = strchr(line, '\n'); 00347 if (c) *c = '\0'; 00348 c = strrchr(line, ' '); 00349 if (c == NULL) { 00350 E_ERROR("Could not find sample byte order!\n"); 00351 goto error_out; 00352 } 00353 ++c; 00354 if (0 == memcmp(c, "01", 2)) { 00355 cmd_ln_set_str("-input_endian", "little"); 00356 } 00357 else if (0 == memcmp(c, "10", 2)) { 00358 cmd_ln_set_str("-input_endian", "big"); 00359 } 00360 else { 00361 E_ERROR("Unknown byte order %s\n", c); 00362 goto error_out; 00363 } 00364 00365 /* We are ready to rumble. */ 00366 return 0; 00367 error_out: 00368 return -1; 00369 } 00370 00371 static int 00372 extract_pitch(const char *in, const char *out) 00373 { 00374 FILE *infh = NULL, *outfh = NULL; 00375 size_t flen, fshift, nsamps; 00376 int16 *buf = NULL; 00377 yin_t *yin = NULL; 00378 uint16 period, bestdiff; 00379 int32 sps; 00380 00381 if (out) { 00382 if ((outfh = fopen(out, "w")) == NULL) { 00383 E_ERROR_SYSTEM("Failed to open %s for writing", out); 00384 goto error_out; 00385 } 00386 } 00387 else { 00388 outfh = stdout; 00389 } 00390 if ((infh = fopen(in, "rb")) == NULL) { 00391 E_ERROR_SYSTEM("Failed to open %s for reading", in); 00392 goto error_out; 00393 } 00394 00395 /* If we weren't told what the file type is, weakly try to 00396 * determine it (actually it's pretty obvious) */ 00397 if (!(cmd_ln_boolean("-raw") 00398 || cmd_ln_boolean("-mswav") 00399 || cmd_ln_boolean("-nist"))) { 00400 if (guess_file_type(in, infh) < 0) 00401 goto error_out; 00402 } 00403 00404 /* Grab the sampling rate and byte order from the header and also 00405 * make sure this is 16-bit linear PCM. */ 00406 if (cmd_ln_boolean("-mswav")) { 00407 if (read_riff_header(infh) < 0) 00408 goto error_out; 00409 } 00410 else if (cmd_ln_boolean("-nist")) { 00411 if (read_nist_header(infh) < 0) 00412 goto error_out; 00413 } 00414 else if (cmd_ln_boolean("-raw")) { 00415 /* Just use some defaults for sampling rate and endian. */ 00416 if (cmd_ln_str("-input_endian") == NULL) { 00417 if (WORDS_BIGENDIAN) 00418 cmd_ln_set_str("-input_endian", "big"); 00419 else 00420 cmd_ln_set_str("-input_endian", "little"); 00421 } 00422 if (cmd_ln_int32("-samprate") == 0) 00423 cmd_ln_set_int32("-samprate", 16000); 00424 } 00425 00426 /* Now read frames and write pitch estimates. */ 00427 sps = cmd_ln_int32("-samprate"); 00428 flen = (size_t)(0.5 + sps * cmd_ln_float32("-flen")); 00429 fshift = (size_t)(0.5 + sps * cmd_ln_float32("-fshift")); 00430 yin = yin_init(flen, cmd_ln_float32("-voice_thresh"), 00431 cmd_ln_float32("-search_range"), 00432 cmd_ln_int32("-smooth_window")); 00433 if (yin == NULL) { 00434 E_ERROR("Failed to initialize YIN\n"); 00435 goto error_out; 00436 } 00437 buf = ckd_calloc(flen, sizeof(*buf)); 00438 /* Read the first full frame of data. */ 00439 if (fread(buf, sizeof(*buf), flen, infh) != flen) { 00440 /* Fail silently, which is probably okay. */ 00441 } 00442 yin_start(yin); 00443 nsamps = 0; 00444 while (!feof(infh)) { 00445 /* Process a frame of data. */ 00446 yin_write(yin, buf); 00447 if (yin_read(yin, &period, &bestdiff)) { 00448 fprintf(outfh, "%.3f %.2f %.2f\n", 00449 /* Time point. */ 00450 (double)nsamps/sps, 00451 /* "Probability" of voicing. */ 00452 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768, 00453 /* Pitch (possibly bogus) */ 00454 period == 0 ? sps : (double)sps / period); 00455 nsamps += fshift; 00456 } 00457 /* Shift it back and get the next frame's overlap. */ 00458 memmove(buf, buf + fshift, (flen - fshift) * sizeof(*buf)); 00459 if (fread(buf + flen - fshift, sizeof(*buf), fshift, infh) != fshift) { 00460 /* Fail silently (FIXME: really?) */ 00461 } 00462 } 00463 yin_end(yin); 00464 /* Process trailing frames of data. */ 00465 while (yin_read(yin, &period, &bestdiff)) { 00466 fprintf(outfh, "%.3f %.2f %.2f\n", 00467 /* Time point. */ 00468 (double)nsamps/sps, 00469 /* "Probability" of voicing. */ 00470 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768, 00471 /* Pitch (possibly bogus) */ 00472 period == 0 ? sps : (double)sps / period); 00473 } 00474 00475 if (yin) 00476 yin_free(yin); 00477 ckd_free(buf); 00478 fclose(infh); 00479 if (outfh != stdout) 00480 fclose(outfh); 00481 return 0; 00482 00483 error_out: 00484 yin_free(yin); 00485 ckd_free(buf); 00486 if (infh) fclose(infh); 00487 if (outfh && outfh != stdout) fclose(outfh); 00488 return -1; 00489 } 00490 00491 static int 00492 run_control_file(const char *ctl) 00493 { 00494 FILE *ctlfh; 00495 char *line; 00496 char *di, *dout, *ei, *eio; 00497 size_t len; 00498 int rv, guess_type, guess_sps, guess_endian; 00499 int32 skip, runlen; 00500 00501 skip = cmd_ln_int32("-nskip"); 00502 runlen = cmd_ln_int32("-runlen"); 00503 00504 /* Whether to guess file types */ 00505 guess_type = !(cmd_ln_boolean("-raw") 00506 || cmd_ln_boolean("-mswav") 00507 || cmd_ln_boolean("-nist")); 00508 /* Whether to guess sampling rate */ 00509 guess_sps = (cmd_ln_int32("-samprate") == 0); 00510 /* Whether to guess endian */ 00511 guess_endian = (cmd_ln_str("-input_endian") == NULL); 00512 00513 if ((ctlfh = fopen(ctl, "r")) == NULL) { 00514 E_ERROR_SYSTEM("Failed to open control file %s", ctl); 00515 return -1; 00516 } 00517 if (cmd_ln_str("-di")) 00518 di = string_join(cmd_ln_str("-di"), "/", NULL); 00519 else 00520 di = ckd_salloc(""); 00521 if (cmd_ln_str("-do")) 00522 dout = string_join(cmd_ln_str("-do"), "/", NULL); 00523 else 00524 dout = ckd_salloc(""); 00525 if (cmd_ln_str("-ei")) 00526 ei = string_join(".", cmd_ln_str("-ei"), NULL); 00527 else 00528 ei = ckd_salloc(""); 00529 if (cmd_ln_str("-eo")) 00530 eio = string_join(".", cmd_ln_str("-eo"), NULL); 00531 else 00532 eio = ckd_salloc(""); 00533 rv = 0; 00534 while ((line = fread_line(ctlfh, &len)) != NULL) { 00535 char *infile, *outfile; 00536 00537 if (skip-- > 0) { 00538 ckd_free(line); 00539 continue; 00540 } 00541 if (runlen == 0) { 00542 ckd_free(line); 00543 break; 00544 } 00545 --runlen; 00546 00547 if (line[len-1] == '\n') 00548 line[len-1] = '\0'; 00549 00550 infile = string_join(di, line, ei, NULL); 00551 outfile = string_join(dout, line, eio, NULL); 00552 00553 /* Reset various guessed information */ 00554 if (guess_type) { 00555 cmd_ln_set_boolean("-nist", FALSE); 00556 cmd_ln_set_boolean("-mswav", FALSE); 00557 cmd_ln_set_boolean("-raw", FALSE); 00558 } 00559 if (guess_sps) 00560 cmd_ln_set_int32("-samprate", 0); 00561 if (guess_endian) 00562 cmd_ln_set_str("-input_endian", NULL); 00563 00564 rv = extract_pitch(infile, outfile); 00565 00566 ckd_free(infile); 00567 ckd_free(outfile); 00568 ckd_free(line); 00569 00570 if (rv != 0) 00571 break; 00572 } 00573 ckd_free(di); 00574 ckd_free(dout); 00575 ckd_free(ei); 00576 ckd_free(eio); 00577 fclose(ctlfh); 00578 return rv; 00579 }