SphinxBase 0.6

include/sphinxbase/fe.h

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1996-2004 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 
00038 /*
00039  * fe.h
00040  * 
00041  * $Log: fe.h,v $
00042  * Revision 1.11  2005/02/05 02:15:02  egouvea
00043  * Removed fe_process(), never used
00044  *
00045  * Revision 1.10  2004/12/10 16:48:55  rkm
00046  * Added continuous density acoustic model handling
00047  *
00048  *
00049  */
00050 
/*
 * Win32 compatibility shims: when WIN32 is defined and GNUWINCE is not,
 * srand48()/lrand48() are mapped onto the C library's srand()/rand().
 *
 * These definitions sit before the include guard, so they are re-evaluated
 * on every inclusion (an identical macro redefinition is permitted in C).
 *
 * NOTE(review): rand() only guarantees values up to RAND_MAX, which may be
 * a narrower range than lrand48() provides -- confirm callers do not depend
 * on the full lrand48() range.
 */
00051 #if defined(WIN32) && !defined(GNUWINCE)
00052 #define srand48(x) srand(x)
00053 #define lrand48() rand()
00054 #endif
00055 
00056 #ifndef _NEW_FE_H_
00057 #define _NEW_FE_H_
00058 
00059 /* Win32/WinCE DLL gunk */
00060 #include <sphinxbase/sphinxbase_export.h>
00061 
00062 #include <sphinxbase/cmd_ln.h>
00063 #include <sphinxbase/fixpoint.h>
00064 
/*
 * Give every declaration below C linkage when compiled as C++.
 * The matching closing brace is at the end of the header, also under
 * #ifdef __cplusplus.
 *
 * The "#if 0" section is never compiled: it only supplies a closing brace
 * so that editor brace matching/indentation is not confused by the
 * otherwise unbalanced "{" above.
 */
00065 #ifdef __cplusplus
00066 extern "C" {
00067 #endif
00068 #if 0
00069 /* Fool Emacs. */
00070 }
00071 #endif
00072 
/*
 * NATIVE_ENDIAN: string literal naming the byte order selected at build
 * time -- "big" when WORDS_BIGENDIAN is defined, "little" otherwise.
 * It is used as the default value of the "-input_endian" option in
 * waveform_to_cepstral_command_line_macro().
 */
00073 #ifdef WORDS_BIGENDIAN
00074 #define NATIVE_ENDIAN "big"
00075 #else
00076 #define NATIVE_ENDIAN "little"
00077 #endif
00078 
/*
 * Default front-end parameter values.
 *
 * Most of these are turned into option default strings with
 * ARG_STRINGIFY() inside waveform_to_cepstral_command_line_macro(), so each
 * must stay a single bare literal: adding parentheses, casts or suffixes
 * would change the resulting default string.
 *
 * NOTE(review): units are not stated in this header; the unit remarks
 * below are assumptions to be confirmed against the implementation.
 */

/* Default for "-samprate" (sampling rate; presumably Hz). */
00080 #define DEFAULT_SAMPLING_RATE 16000
00081 
/* Default for "-frate" (frame rate; presumably frames per second). */
00082 #define DEFAULT_FRAME_RATE 100
00083 
/*
 * Not referenced elsewhere in this header.  Equals
 * DEFAULT_SAMPLING_RATE / DEFAULT_FRAME_RATE (16000 / 100), so presumably
 * the frame shift in samples at the default rates -- TODO confirm.
 */
00085 #define DEFAULT_FRAME_SHIFT 160
00086 
/* Default for "-wlen" (Hamming window length; presumably seconds). */
00087 #define DEFAULT_WINDOW_LENGTH 0.025625 
00088 
/* Default for "-nfft" (size of FFT). */
00089 #define DEFAULT_FFT_SIZE 512
00090 
/* Default for "-ncep" (number of cepstral coefficients). */
00091 #define DEFAULT_NUM_CEPSTRA 13
00092 
/* Default for "-nfilt" (number of filter banks). */
00093 #define DEFAULT_NUM_FILTERS 40
00094 
/* Default for "-lowerf" (lower edge of filters; presumably Hz). */
00095 #define DEFAULT_LOWER_FILT_FREQ 133.33334
00096 
/* Default for "-upperf" (upper edge of filters; presumably Hz). */
00097 #define DEFAULT_UPPER_FILT_FREQ 6855.4976
00098 
/* Default for "-alpha" (preemphasis parameter). */
00099 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
00100 
/* Default for "-warp_type" (warping function type); already a string. */
00101 #define DEFAULT_WARP_TYPE "inverse_linear"
00102 
/*
 * Default for "-seed".  The option help text says a value less than zero
 * means the front end picks its own seed.  Deliberately left
 * unparenthesized so that it stringifies to "-1".
 */
00103 #define SEED  -1
00104 
/*
 * waveform_to_cepstral_command_line_macro()
 *
 * Expands to a comma-separated list of brace-enclosed option descriptors
 * for the front end.  Each descriptor has four fields, in this order:
 *   1. option name (string, with leading '-'),
 *   2. value type (ARG_BOOLEAN, ARG_STRING, ARG_FLOAT32 or ARG_INT32),
 *   3. default value as a string (NULL for "-warp_params"),
 *   4. help text.
 *
 * The expansion has no trailing comma after the final ("-verbose") entry,
 * so a caller that appends further entries or a terminator must supply the
 * separating comma itself.
 *
 * Numeric defaults are produced with ARG_STRINGIFY() from the DEFAULT_* and
 * SEED macros; "-input_endian" defaults to NATIVE_ENDIAN.
 *
 * NOTE(review): presumably intended for use inside an arg_t array
 * initializer (arg_t and ARG_STRINGIFY come from cmd_ln.h, not shown
 * here) -- confirm against cmd_ln.h.
 */
00105 #define waveform_to_cepstral_command_line_macro() \
00106   { "-logspec", \
00107     ARG_BOOLEAN, \
00108     "no", \
00109     "Write out logspectral files instead of cepstra" }, \
00110    \
00111   { "-smoothspec", \
00112     ARG_BOOLEAN, \
00113     "no", \
00114     "Write out cepstral-smoothed logspectral files" }, \
00115    \
00116   { "-transform", \
00117     ARG_STRING, \
00118     "legacy", \
00119     "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
00120    \
00121   { "-alpha", \
00122     ARG_FLOAT32, \
00123     ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
00124     "Preemphasis parameter" }, \
00125    \
00126   { "-samprate", \
00127     ARG_FLOAT32, \
00128     ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
00129     "Sampling rate" }, \
00130    \
00131   { "-frate", \
00132     ARG_INT32, \
00133     ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
00134     "Frame rate" }, \
00135    \
00136   { "-wlen", \
00137     ARG_FLOAT32, \
00138     ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
00139     "Hamming window length" }, \
00140    \
00141   { "-nfft", \
00142     ARG_INT32, \
00143     ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
00144     "Size of FFT" }, \
00145    \
00146   { "-nfilt", \
00147     ARG_INT32, \
00148     ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
00149     "Number of filter banks" }, \
00150    \
00151   { "-lowerf", \
00152     ARG_FLOAT32, \
00153     ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
00154     "Lower edge of filters" }, \
00155    \
00156   { "-upperf", \
00157     ARG_FLOAT32, \
00158     ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
00159     "Upper edge of filters" }, \
00160    \
00161   { "-unit_area", \
00162     ARG_BOOLEAN, \
00163     "yes", \
00164     "Normalize mel filters to unit area" }, \
00165    \
00166   { "-round_filters", \
00167     ARG_BOOLEAN, \
00168     "yes", \
00169     "Round mel filter frequencies to DFT points" }, \
00170    \
00171   { "-ncep", \
00172     ARG_INT32, \
00173     ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
00174     "Number of cep coefficients" }, \
00175    \
00176   { "-doublebw", \
00177     ARG_BOOLEAN, \
00178     "no", \
00179     "Use double bandwidth filters (same center freq)" }, \
00180    \
00181   { "-lifter", \
00182     ARG_INT32, \
00183     "0", \
00184     "Length of sin-curve for liftering, or 0 for no liftering." }, \
00185    \
00186   { "-input_endian", \
00187     ARG_STRING, \
00188     NATIVE_ENDIAN, \
00189     "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
00190    \
00191   { "-warp_type", \
00192     ARG_STRING, \
00193     DEFAULT_WARP_TYPE, \
00194     "Warping function type (or shape)" }, \
00195    \
00196   { "-warp_params", \
00197     ARG_STRING, \
00198     NULL, \
00199     "Parameters defining the warping function" }, \
00200    \
00201   { "-dither", \
00202     ARG_BOOLEAN, \
00203     "no", \
00204     "Add 1/2-bit noise" }, \
00205    \
00206   { "-seed", \
00207     ARG_INT32, \
00208     ARG_STRINGIFY(SEED), \
00209     "Seed for random number generator; if less than zero, pick our own" }, \
00210    \
00211   { "-remove_dc", \
00212     ARG_BOOLEAN, \
00213     "no", \
00214     "Remove DC offset from each frame" }, \
00215                                           \
00216   { "-verbose", \
00217     ARG_BOOLEAN, \
00218     "no", \
00219     "Show input filenames" } \
00220   
00221   
/*
 * mfcc_t and its helper macros: the numeric type used for front-end
 * feature values, selected at build time by FIXED_POINT.
 *
 *   FIXED_POINT defined:   mfcc_t is fixed32 and the helpers forward to
 *                          FLOAT2FIX / FIX2FLOAT / FIXMUL / FIXLN_ANY
 *                          (presumably provided by fixpoint.h).
 *   FIXED_POINT undefined: mfcc_t is float32, the conversions are identity
 *                          macros, MFCCMUL is a plain multiplication, and
 *                          MFCCLN(x,in,out) ignores `in` and `out` and
 *                          expands to log(x).
 *
 * NOTE(review): this header does not itself include <math.h>; code using
 * MFCCLN in the floating-point build needs log() declared -- confirm it is
 * pulled in by another include.
 */
00222 #ifdef FIXED_POINT
00223 
00224 typedef fixed32 mfcc_t;
00225 
/* Convert a floating-point value to mfcc_t. */
00227 #define FLOAT2MFCC(x) FLOAT2FIX(x)
00228 
/* Convert an mfcc_t value to floating point. */
00229 #define MFCC2FLOAT(x) FIX2FLOAT(x)
00230 
/* Multiply two mfcc_t values / take a logarithm, in fixed point. */
00231 #define MFCCMUL(a,b) FIXMUL(a,b)
00232 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
00233 #else /* !FIXED_POINT */
00234 
00236 typedef float32 mfcc_t;
/* Identity: mfcc_t is already floating point in this configuration. */
00238 #define FLOAT2MFCC(x) (x)
00239 
/* Identity: mfcc_t is already floating point in this configuration. */
00240 #define MFCC2FLOAT(x) (x)
00241 
/* Plain multiplication; MFCCLN drops its `in`/`out` arguments. */
00242 #define MFCCMUL(a,b) ((a)*(b))
00243 #define MFCCLN(x,in,out) log(x)
00244 #endif /* !FIXED_POINT */
00245 
/*
 * Front-end object handle.  struct fe_s is only forward-declared here, so
 * users of this header can hold and pass fe_t pointers but cannot access
 * its members.
 */
00249 typedef struct fe_s fe_t;
00250 
/*
 * Status codes for the front end.
 *
 * Success is 0 (FE_SUCCESS and FE_OUTPUT_FILE_SUCCESS are two names for the
 * same value); every error code is negative.
 *
 * NOTE(review): the fe_* functions below return plain int; presumably
 * those values are drawn from this enum -- confirm per function.
 */
00254 enum fe_error_e {
00255         FE_SUCCESS = 0,
00256         FE_OUTPUT_FILE_SUCCESS  = 0,
00257         FE_CONTROL_FILE_ERROR = -1,
00258         FE_START_ERROR = -2,
00259         FE_UNKNOWN_SINGLE_OR_BATCH = -3,
00260         FE_INPUT_FILE_OPEN_ERROR = -4,
00261         FE_INPUT_FILE_READ_ERROR = -5,
00262         FE_MEM_ALLOC_ERROR = -6,
00263         FE_OUTPUT_FILE_WRITE_ERROR = -7,
00264         FE_OUTPUT_FILE_OPEN_ERROR = -8,
00265         FE_ZERO_ENERGY_ERROR = -9,
00266         FE_INVALID_PARAM_ERROR =  -10
00267 };
00268 
/**
 * Create a front-end object without an explicit configuration argument.
 *
 * NOTE(review): presumably configured from global command-line state
 * (compare fe_init_auto_r(), which takes a cmd_ln_t) -- confirm in the
 * implementation.
 *
 * @return Pointer to a front-end object.  The failure value is not visible
 *         from this header (presumably NULL -- TODO confirm).
 */
00276 SPHINXBASE_EXPORT
00277 fe_t* fe_init_auto(void);
00278 
/**
 * Get the front end's argument definitions.
 *
 * NOTE(review): presumably the table built from
 * waveform_to_cepstral_command_line_macro(), suitable for parsing a
 * configuration for fe_init_auto_r() -- confirm in the implementation.
 *
 * @return Pointer to const arg_t data; the caller must not modify it.
 */
00286 SPHINXBASE_EXPORT
00287 arg_t const *fe_get_args(void);
00288 
/**
 * Create a front-end object from an explicit configuration.
 *
 * @param config Configuration object (cmd_ln_t).
 *               NOTE(review): whether the front end takes ownership of or
 *               retains `config` is not visible here -- confirm before
 *               freeing it in the caller.
 * @return Pointer to a front-end object.  The failure value is not visible
 *         from this header (presumably NULL -- TODO confirm).
 */
00299 SPHINXBASE_EXPORT
00300 fe_t *fe_init_auto_r(cmd_ln_t *config);
00301 
/**
 * Get the configuration object associated with a front end.
 *
 * @param fe Front-end object.
 * @return cmd_ln_t pointer; presumably the object given to
 *         fe_init_auto_r() and still owned by the front end -- TODO confirm
 *         ownership before freeing.
 */
00309 SPHINXBASE_EXPORT
00310 cmd_ln_t *fe_get_config(fe_t *fe);
00311 
/**
 * Begin processing an utterance.
 *
 * @param fe Front-end object.
 * @return int status; presumably 0 (FE_SUCCESS) on success and a negative
 *         fe_error_e value on failure -- TODO confirm.
 */
00316 SPHINXBASE_EXPORT
00317 int fe_start_utt(fe_t *fe);
00318 
/**
 * Get the output size of a front end.
 *
 * NOTE(review): presumably the number of mfcc_t values in one output frame
 * (which would depend on options such as -ncep / -logspec) -- confirm.
 *
 * @param fe Front-end object.
 * @return Output size as an int.
 */
00331 SPHINXBASE_EXPORT
00332 int fe_get_output_size(fe_t *fe);
00333 
/**
 * Report the input framing parameters of a front end through two output
 * pointers.
 *
 * @param fe              Front-end object.
 * @param out_frame_shift Receives the frame shift (presumably in samples).
 * @param out_frame_size  Receives the frame size (presumably in samples).
 *
 * NOTE(review): whether either output pointer may be NULL is not visible
 * here -- confirm before passing NULL.
 */
00346 SPHINXBASE_EXPORT
00347 void fe_get_input_size(fe_t *fe, int *out_frame_shift,
00348                        int *out_frame_size);
00349 
/**
 * Finish processing an utterance.
 *
 * @param fe            Front-end object.
 * @param out_cepvector Output buffer of mfcc_t values; presumably receives
 *                      any final frame produced from leftover samples and
 *                      must hold at least fe_get_output_size() values --
 *                      TODO confirm.
 * @param out_nframes   Receives the number of frames written.
 * @return int status; presumably 0 on success, negative fe_error_e on
 *         failure -- TODO confirm.
 */
00364 SPHINXBASE_EXPORT
00365 int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
00366 
/**
 * Retain a front-end object.
 *
 * NOTE(review): presumably increments a reference count that fe_free()
 * decrements -- confirm in the implementation.
 *
 * @param fe Front-end object.
 * @return fe_t pointer (presumably the same `fe`).
 */
00372 SPHINXBASE_EXPORT
00373 fe_t *fe_retain(fe_t *fe);
00374 
/**
 * Release a front-end object.
 *
 * @param fe Front-end object.
 * @return int; presumably the remaining reference count (0 once the object
 *         has actually been destroyed) -- TODO confirm.  Whether a NULL
 *         `fe` is accepted is not visible here.
 */
00382 SPHINXBASE_EXPORT
00383 int fe_free(fe_t *fe);
00384 
/**
 * Process a single frame of audio.
 *
 * @param fe      Front-end object.
 * @param spch    Input samples (16-bit integers, not modified).
 * @param nsamps  Number of samples available in `spch`.
 * @param out_cep Output buffer for one frame of mfcc_t values; presumably
 *                must hold at least fe_get_output_size() values -- TODO
 *                confirm.
 * @return int status; presumably 0 on success, negative fe_error_e on
 *         failure -- TODO confirm.
 */
00393 SPHINXBASE_EXPORT
00394 int fe_process_frame(fe_t *fe, int16 const *spch,
00395                      int32 nsamps, mfcc_t *out_cep);
00396 
/**
 * Process a block of audio into as many frames as possible.
 *
 * The `inout_` parameters are passed by pointer so the function can update
 * them.
 *
 * @param fe            Front-end object.
 * @param inout_spch    In: pointer to the input sample pointer.  Out:
 *                      presumably advanced past the consumed samples.
 * @param inout_nsamps  In: number of samples available.  Out: presumably
 *                      the number of samples left unconsumed.
 * @param buf_cep       Array of per-frame output buffers (mfcc_t *).
 * @param inout_nframes In: presumably the capacity of `buf_cep` in frames.
 *                      Out: presumably the number of frames written.
 * @return int; meaning not visible from this header (status code or frame
 *         count) -- TODO confirm against the implementation.
 *
 * NOTE(review): size_t is used here without a visible <stddef.h> include;
 * presumably provided by one of the headers included above.
 */
00444 SPHINXBASE_EXPORT
00445 int fe_process_frames(fe_t *fe,
00446                       int16 const **inout_spch,
00447                       size_t *inout_nsamps,
00448                       mfcc_t **buf_cep,
00449                       int32 *inout_nframes);
00450 
/**
 * Process a block of samples and return the resulting frames through
 * `cep_block`.
 *
 * @param fe        Front-end object.
 * @param spch      Input samples (16-bit integers, not modified).
 * @param nsamps    Number of samples in `spch`.
 * @param cep_block Receives a pointer to a 2-D mfcc_t array; presumably
 *                  newly allocated and to be released with fe_free_2d() --
 *                  TODO confirm ownership.
 * @param nframes   Receives the number of frames produced.
 * @return int status; presumably 0 on success, negative fe_error_e on
 *         failure -- TODO confirm.
 */
00466 SPHINXBASE_EXPORT
00467 int fe_process_utt(fe_t *fe,  
00468                    int16 const *spch, 
00469                    size_t nsamps, 
00470                    mfcc_t ***cep_block, 
00471                    int32 *nframes 
00472         );
00473 
/**
 * Free a 2-D array.
 *
 * NOTE(review): presumably the counterpart for the block returned through
 * `cep_block` by fe_process_utt() -- confirm which allocations may be
 * passed here.
 *
 * @param arr Array to free.
 */
00477 SPHINXBASE_EXPORT
00478 void fe_free_2d(void *arr);
00479 
/**
 * Convert frames of mfcc_t values to float32.
 *
 * In the non-FIXED_POINT build, mfcc_t is itself float32.
 *
 * @param fe      Front-end object.
 * @param input   Array of `nframes` mfcc_t frame pointers.
 * @param output  Array of `nframes` float32 frame pointers.
 * @param nframes Number of frames to convert.
 * @return int; meaning not visible here -- TODO confirm.
 *
 * NOTE(review): whether `input` and `output` may alias (in-place
 * conversion) is not visible here -- confirm.
 */
00483 SPHINXBASE_EXPORT
00484 int fe_mfcc_to_float(fe_t *fe,
00485                      mfcc_t **input,
00486                      float32 **output,
00487                      int32 nframes);
00488 
/**
 * Convert frames of float32 values to mfcc_t.
 *
 * In the non-FIXED_POINT build, mfcc_t is itself float32.
 *
 * @param fe      Front-end object.
 * @param input   Array of `nframes` float32 frame pointers.
 * @param output  Array of `nframes` mfcc_t frame pointers.
 * @param nframes Number of frames to convert.
 * @return int; meaning not visible here -- TODO confirm.
 *
 * NOTE(review): whether `input` and `output` may alias (in-place
 * conversion) is not visible here -- confirm.
 */
00492 SPHINXBASE_EXPORT
00493 int fe_float_to_mfcc(fe_t *fe,
00494                      float32 **input,
00495                      mfcc_t **output,
00496                      int32 nframes);
00497 
/**
 * Convert one frame of log spectrum into cepstral (MFCC) values.
 *
 * @param fe      Front-end object.
 * @param fr_spec Input log-spectrum frame (read-only).
 * @param fr_cep  Output cepstrum frame.
 * @return int status; presumably 0 on success -- TODO confirm.
 *
 * NOTE(review): the exact transform variant and the required lengths of
 * both buffers are not visible here (presumably -nfilt inputs and -ncep
 * outputs) -- confirm.
 */
00521 SPHINXBASE_EXPORT
00522 int fe_logspec_to_mfcc(fe_t *fe,  
00523                        const mfcc_t *fr_spec, 
00524                        mfcc_t *fr_cep 
00525         );
00526 
/**
 * Convert one frame of log spectrum to cepstra; by its name, using a
 * DCT-II (TODO confirm scaling/normalization in the implementation).
 *
 * @param fe      Front-end object.
 * @param fr_spec Input log-spectrum frame (read-only).
 * @param fr_cep  Output cepstrum frame.
 * @return int status; presumably 0 on success -- TODO confirm.
 */
00535 SPHINXBASE_EXPORT
00536 int fe_logspec_dct2(fe_t *fe,  
00537                     const mfcc_t *fr_spec, 
00538                     mfcc_t *fr_cep 
00539         );
00540 
/**
 * Convert one frame of cepstra back to log spectrum; by its name, using a
 * DCT-III (TODO confirm scaling/normalization in the implementation).
 *
 * Note the argument order relative to fe_logspec_dct2(): here the cepstrum
 * is the read-only input and the spectrum is the output.
 *
 * @param fe      Front-end object.
 * @param fr_cep  Input cepstrum frame (read-only).
 * @param fr_spec Output log-spectrum frame.
 * @return int status; presumably 0 on success -- TODO confirm.
 */
00549 SPHINXBASE_EXPORT
00550 int fe_mfcc_dct3(fe_t *fe,  
00551                  const mfcc_t *fr_cep, 
00552                  mfcc_t *fr_spec 
00553         );
00554 
00555 #ifdef __cplusplus
00556 }
00557 #endif
00558 
00559 
00560 #endif