00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00041 #include <logmath.h>
00042 #include <ngram_model.h>
00043 #include <cmd_ln.h>
00044 #include <ckd_alloc.h>
00045 #include <err.h>
00046 #include <strfuncs.h>
00047
00048 #include <stdio.h>
00049 #include <string.h>
00050 #include <math.h>
00051
00052 static const arg_t defn[] = {
00053 { "-help",
00054 ARG_BOOLEAN,
00055 "no",
00056 "Shows the usage of the tool"},
00057
00058 { "-logbase",
00059 ARG_FLOAT64,
00060 "1.0001",
00061 "Base in which all log-likelihoods calculated" },
00062
00063 { "-lm",
00064 ARG_STRING,
00065 NULL,
00066 "Language model file"},
00067
00068 { "-probdef",
00069 ARG_STRING,
00070 NULL,
00071 "Probability definition file for classes in LM"},
00072
00073 { "-lmctlfn",
00074 ARG_STRING,
00075 NULL,
00076 "Control file listing a set of language models"},
00077
00078 { "-lmname",
00079 ARG_STRING,
00080 NULL,
00081 "Name of language model in -lmctlfn to use for all utterances" },
00082
00083 { "-lsn",
00084 ARG_STRING,
00085 NULL,
00086 "Transcription file to evaluate"},
00087
00088 { "-text",
00089 ARG_STRING,
00090 "Text string to evaluate"},
00091
00092 { "-mmap",
00093 ARG_BOOLEAN,
00094 "no",
00095 "Use memory-mapped I/O for reading binary LM files"},
00096
00097 { "-lw",
00098 ARG_FLOAT32,
00099 "1.0",
00100 "Language model weight" },
00101
00102 { "-wip",
00103 ARG_FLOAT32,
00104 "1.0",
00105 "Word insertion probability" },
00106
00107 { "-uw",
00108 ARG_FLOAT32,
00109 "1.0",
00110 "Unigram probability weight (interpolated with uniform distribution)"},
00111
00112
00113 { NULL, 0, NULL, NULL }
00114 };
00115
00116 static int
00117 calc_entropy(ngram_model_t *lm, char **words, int32 n,
00118 int32 *out_n_ccs, int32 *out_n_oovs)
00119 {
00120 int32 *wids;
00121 int32 startwid;
00122 int32 i, ch, nccs, noovs;
00123
00124
00125 wids = ckd_calloc(n, sizeof(*wids));
00126 for (i = 0; i < n; ++i)
00127 wids[n-i-1] = ngram_wid(lm, words[i]);
00128
00129 startwid = ngram_wid(lm, "<s>");
00130
00131
00132
00133 ch = noovs = nccs = 0;
00134 for (i = 0; i < n; ++i) {
00135 int32 n_used;
00136 int32 prob;
00137
00138
00139 if (wids[i] == startwid) {
00140 ++nccs;
00141 continue;
00142 }
00143
00144 if (wids[i] == NGRAM_INVALID_WID) {
00145 ++noovs;
00146 continue;
00147 }
00148
00149 prob = ngram_ng_score(lm,
00150 wids[i], wids + i + 1,
00151 n - i - 1, &n_used);
00152 ch -= prob;
00153 }
00154
00155 if (out_n_ccs) *out_n_ccs = nccs;
00156 if (out_n_oovs) *out_n_oovs = noovs;
00157
00158
00159 return ch / n;
00160 }
00161
00162 static void
00163 evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
00164 {
00165 FILE *fh;
00166 char line[256];
00167 int32 nccs, noovs, nwords;
00168 float64 ch, log_to_log2;;
00169
00170 if ((fh = fopen(lsnfn, "r")) == NULL)
00171 E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
00172
00173
00174
00175 log_to_log2 = log(logmath_get_base(lmath)) / log(2);
00176 nccs = noovs = nwords = 0;
00177 ch = 0.0;
00178 while (fgets(line, sizeof(line), fh)) {
00179 char **words;
00180 int32 n, tmp_ch, tmp_noovs, tmp_nccs;
00181
00182 n = str2words(line, NULL, 0);
00183 if (n < 0)
00184 E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
00185 if (n == 0)
00186 continue;
00187 words = ckd_calloc(n, sizeof(*words));
00188 str2words(line, words, n);
00189
00190
00191 if (words[n-1][0] == '('
00192 && words[n-1][strlen(words[n-1])-1] == ')')
00193 n = n - 1;
00194
00195 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, &tmp_noovs);
00196
00197 ch += (float64) tmp_ch * n * log_to_log2;
00198 nccs += tmp_nccs;
00199 noovs += tmp_noovs;
00200 nwords += n;
00201
00202 ckd_free(words);
00203 }
00204
00205 ch /= nwords;
00206 printf("cross-entropy: %f bits\n", ch);
00207
00208
00209 printf("perplexity: %f\n", pow(2.0, ch));
00210
00211
00212 printf("%d words evaluated\n", nwords);
00213 printf("%d OOVs, %d context cues removed\n",
00214 noovs, nccs);
00215 }
00216
00217 static void
00218 evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
00219 {
00220 char *textfoo;
00221 char **words;
00222 int32 n, ch, noovs, nccs;
00223
00224
00225 textfoo = ckd_salloc(text);
00226 n = str2words(textfoo, NULL, 0);
00227 if (n < 0)
00228 E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
00229 if (n == 0)
00230 return;
00231 words = ckd_calloc(n, sizeof(*words));
00232 str2words(textfoo, words, n);
00233
00234 ch = calc_entropy(lm, words, n, &nccs, &noovs);
00235
00236 printf("input: %s\n", text);
00237 printf("cross-entropy: %f bits\n",
00238 ch * log(logmath_get_base(lmath)) / log(2));
00239
00240
00241 printf("perplexity: %f\n", logmath_exp(lmath, ch));
00242
00243
00244 printf("%d words evaluated\n", n);
00245 printf("%d OOVs, %d context cues removed\n",
00246 noovs, nccs);
00247
00248 ckd_free(textfoo);
00249 ckd_free(words);
00250 }
00251
00252 int
00253 main(int argc, char *argv[])
00254 {
00255 cmd_ln_t *config;
00256 ngram_model_t *lm = NULL;
00257 logmath_t *lmath;
00258 const char *lmfn, *probdefn, *lsnfn, *text;
00259
00260 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00261 return 1;
00262
00263
00264 if ((lmath = logmath_init
00265 (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00266 E_FATAL("Failed to initialize log math\n");
00267 }
00268
00269
00270 lmfn = cmd_ln_str_r(config, "-lm");
00271 if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
00272 ngram_model_read_classdef(lm, probdefn);
00273 ngram_model_apply_weights(lm,
00274 cmd_ln_float32_r(config, "-lw"),
00275 cmd_ln_float32_r(config, "-wip"),
00276 cmd_ln_float32_r(config, "-uw"));
00277
00278 if (lmfn == NULL
00279 || (lm = ngram_model_read(config, lmfn,
00280 NGRAM_AUTO, lmath)) == NULL) {
00281 E_FATAL("Failed to load language model from %s\n",
00282 cmd_ln_str_r(config, "-lm"));
00283 }
00284
00285
00286 lsnfn = cmd_ln_str_r(config, "-lsn");
00287 text = cmd_ln_str_r(config, "-text");
00288 if (lsnfn) {
00289 evaluate_file(lm, lmath, lsnfn);
00290 }
00291 else if (text) {
00292 evaluate_string(lm, lmath, text);
00293 }
00294
00295 return 0;
00296 }