• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/sphinx_lmtools/lm_eval.c

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2008 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00041 #include <logmath.h>
00042 #include <ngram_model.h>
00043 #include <cmd_ln.h>
00044 #include <ckd_alloc.h>
00045 #include <err.h>
00046 #include <strfuncs.h>
00047 
00048 #include <stdio.h>
00049 #include <string.h>
00050 #include <math.h>
00051 
00052 static const arg_t defn[] = {
00053   { "-help",
00054     ARG_BOOLEAN,
00055     "no",
00056     "Shows the usage of the tool"},
00057 
00058   { "-logbase",
00059     ARG_FLOAT64,
00060     "1.0001",
00061     "Base in which all log-likelihoods calculated" },
00062 
00063   { "-lm",
00064     ARG_STRING,
00065     NULL,
00066     "Language model file"},
00067 
00068   { "-probdef",
00069     ARG_STRING,
00070     NULL,
00071     "Probability definition file for classes in LM"},
00072 
00073   { "-lmctlfn",
00074     ARG_STRING,
00075     NULL,
00076     "Control file listing a set of language models"},
00077 
00078   { "-lmname",
00079     ARG_STRING,
00080     NULL,
00081     "Name of language model in -lmctlfn to use for all utterances" },
00082 
00083   { "-lsn",
00084     ARG_STRING,
00085     NULL,
00086     "Transcription file to evaluate"},
00087 
00088   { "-text",
00089     ARG_STRING,
00090     "Text string to evaluate"},
00091 
00092   { "-mmap",
00093     ARG_BOOLEAN,
00094     "no",
00095     "Use memory-mapped I/O for reading binary LM files"},
00096 
00097   { "-lw",
00098     ARG_FLOAT32,
00099     "1.0",
00100     "Language model weight" },
00101 
00102   { "-wip",
00103     ARG_FLOAT32,
00104     "1.0",
00105     "Word insertion probability" },
00106 
00107   { "-uw",
00108     ARG_FLOAT32,
00109     "1.0",
00110     "Unigram probability weight (interpolated with uniform distribution)"},
00111 
00112   /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
00113   { NULL, 0, NULL, NULL }
00114 };
00115 
00116 static int
00117 calc_entropy(ngram_model_t *lm, char **words, int32 n,
00118              int32 *out_n_ccs, int32 *out_n_oovs)
00119 {
00120         int32 *wids;
00121         int32 startwid;
00122         int32 i, ch, nccs, noovs;
00123 
00124         /* Reverse this array into an array of word IDs. */
00125         wids = ckd_calloc(n, sizeof(*wids));
00126         for (i = 0; i < n; ++i)
00127                 wids[n-i-1] = ngram_wid(lm, words[i]);
00128         /* Skip <s> as it's a context cue (HACK, this should be configurable). */
00129         startwid = ngram_wid(lm, "<s>");
00130 
00131         /* Now evaluate the list of words in reverse using the
00132          * remainder of the array as the history. */
00133         ch = noovs = nccs = 0;
00134         for (i = 0; i < n; ++i) {
00135                 int32 n_used;
00136                 int32 prob;
00137 
00138                 /* Skip <s> as it's a context cue (HACK, this should be configurable). */
00139                 if (wids[i] == startwid) {
00140                         ++nccs;
00141                         continue;
00142                 }
00143                 /* Skip and count OOVs. */
00144                 if (wids[i] == NGRAM_INVALID_WID) {
00145                         ++noovs;
00146                         continue;
00147                 }
00148                 /* Sum up information for each N-gram */
00149                 prob = ngram_ng_score(lm,
00150                                       wids[i], wids + i + 1,
00151                                       n - i - 1, &n_used);
00152                 ch -= prob;
00153         }
00154 
00155         if (out_n_ccs) *out_n_ccs = nccs;
00156         if (out_n_oovs) *out_n_oovs = noovs;
00157 
00158         /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
00159         return ch / n;
00160 }
00161 
00162 static void
00163 evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
00164 {
00165         FILE *fh;
00166         char line[256];
00167         int32 nccs, noovs, nwords;
00168         float64 ch, log_to_log2;;
00169 
00170         if ((fh = fopen(lsnfn, "r")) == NULL)
00171                 E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
00172 
00173         /* We have to keep ch in floating-point to avoid overflows, so
00174          * we might as well use log2. */
00175         log_to_log2 = log(logmath_get_base(lmath)) / log(2);
00176         nccs = noovs = nwords = 0;
00177         ch = 0.0;
00178         while (fgets(line, sizeof(line), fh)) {
00179                 char **words;
00180                 int32 n, tmp_ch, tmp_noovs, tmp_nccs;
00181 
00182                 n = str2words(line, NULL, 0);
00183                 if (n < 0)
00184                         E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
00185                 if (n == 0) /* Do nothing! */
00186                         continue;
00187                 words = ckd_calloc(n, sizeof(*words));
00188                 str2words(line, words, n);
00189 
00190                 /* Remove any utterance ID (FIXME: has to be a single "word") */
00191                 if (words[n-1][0] == '('
00192                     && words[n-1][strlen(words[n-1])-1] == ')')
00193                         n = n - 1;
00194 
00195                 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, &tmp_noovs);
00196 
00197                 ch += (float64) tmp_ch * n * log_to_log2;
00198                 nccs += tmp_nccs;
00199                 noovs += tmp_noovs;
00200                 nwords += n;
00201                 
00202                 ckd_free(words);
00203         }
00204 
00205         ch /= nwords;
00206         printf("cross-entropy: %f bits\n", ch);
00207 
00208         /* Calculate perplexity pplx = exp CH */
00209         printf("perplexity: %f\n", pow(2.0, ch));
00210 
00211         /* Report OOVs and CCs */
00212         printf("%d words evaluated\n", nwords);
00213         printf("%d OOVs, %d context cues removed\n",
00214                noovs, nccs);
00215 }
00216 
00217 static void
00218 evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
00219 {
00220         char *textfoo;
00221         char **words;
00222         int32 n, ch, noovs, nccs;
00223 
00224         /* Split it into an array of strings. */
00225         textfoo = ckd_salloc(text);
00226         n = str2words(textfoo, NULL, 0);
00227         if (n < 0)
00228                 E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
00229         if (n == 0) /* Do nothing! */
00230                 return;
00231         words = ckd_calloc(n, sizeof(*words));
00232         str2words(textfoo, words, n);
00233 
00234         ch = calc_entropy(lm, words, n, &nccs, &noovs);
00235 
00236         printf("input: %s\n", text);
00237         printf("cross-entropy: %f bits\n",
00238                ch * log(logmath_get_base(lmath)) / log(2));
00239 
00240         /* Calculate perplexity pplx = exp CH */
00241         printf("perplexity: %f\n", logmath_exp(lmath, ch));
00242 
00243         /* Report OOVs and CCs */
00244         printf("%d words evaluated\n", n);
00245         printf("%d OOVs, %d context cues removed\n",
00246               noovs, nccs);
00247 
00248         ckd_free(textfoo);
00249         ckd_free(words);
00250 }
00251 
00252 int
00253 main(int argc, char *argv[])
00254 {
00255         cmd_ln_t *config;
00256         ngram_model_t *lm = NULL;
00257         logmath_t *lmath;
00258         const char *lmfn, *probdefn, *lsnfn, *text;
00259 
00260         if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00261                 return 1;
00262 
00263         /* Create log math object. */
00264         if ((lmath = logmath_init
00265              (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00266                 E_FATAL("Failed to initialize log math\n");
00267         }
00268 
00269         /* Load the language model. */
00270         lmfn = cmd_ln_str_r(config, "-lm");
00271         if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
00272             ngram_model_read_classdef(lm, probdefn);
00273         ngram_model_apply_weights(lm,
00274                                   cmd_ln_float32_r(config, "-lw"),
00275                                   cmd_ln_float32_r(config, "-wip"),
00276                                   cmd_ln_float32_r(config, "-uw"));
00277 
00278         if (lmfn == NULL
00279             || (lm = ngram_model_read(config, lmfn,
00280                                       NGRAM_AUTO, lmath)) == NULL) {
00281                 E_FATAL("Failed to load language model from %s\n",
00282                         cmd_ln_str_r(config, "-lm"));
00283         }
00284 
00285         /* Now evaluate some text. */
00286         lsnfn = cmd_ln_str_r(config, "-lsn");
00287         text = cmd_ln_str_r(config, "-text");
00288         if (lsnfn) {
00289                 evaluate_file(lm, lmath, lsnfn);
00290         }
00291         else if (text) {
00292                 evaluate_string(lm, lmath, text);
00293         }
00294 
00295         return 0;
00296 }

Generated on Fri Jan 14 2011 for SphinxBase by  doxygen 1.7.1