src/libsphinxbase/lm/ngram_model.c

/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/**
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#include "config.h"
#include "ngram_model.h"
#include "ngram_model_internal.h"
#include "ckd_alloc.h"
#include "filename.h"
#include "pio.h"
#include "err.h"
#include "logmath.h"
#include "strfuncs.h"
#include "case.h"

#include <string.h>
#include <stdarg.h> /* va_list in ngram_score()/ngram_prob() below */
#include <assert.h>
#ifdef HAVE_ICONV
#include <iconv.h>
#include <errno.h>  /* the iconv error handling below tests errno/E2BIG */
#endif

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_ARPA; /* Default file type */
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_ARPA; /* Default file type */
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP32", 6))
        return NGRAM_DMP32;
    if (0 == strncmp_nocase(ext, ".DMP", 4))
        return NGRAM_DMP;
    return NGRAM_ARPA; /* Default file type */
}
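
A few hypothetical file names and the types the function above maps them to (an illustration, not part of the source file):

    ngram_file_name_to_type("en.lm.arpa");    /* => NGRAM_ARPA */
    ngram_file_name_to_type("en.lm.DMP");     /* => NGRAM_DMP */
    ngram_file_name_to_type("en.lm.arpa.gz"); /* => NGRAM_ARPA (looks behind the .gz) */
    ngram_file_name_to_type("en.lm");         /* => NGRAM_ARPA (the default) */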

ngram_model_t *
ngram_model_read(cmd_ln_t *config,
                 const char *file_name,
                 ngram_file_type_t file_type,
                 logmath_t *lmath)
{
    ngram_model_t *model = NULL;

    switch (file_type) {
    case NGRAM_AUTO: {
        if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
            break;
        if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
            break;
        if ((model = ngram_model_dmp32_read(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    }
    case NGRAM_ARPA:
        model = ngram_model_arpa_read(config, file_name, lmath);
        break;
    case NGRAM_DMP:
        model = ngram_model_dmp_read(config, file_name, lmath);
        break;
    case NGRAM_DMP32:
        model = ngram_model_dmp32_read(config, file_name, lmath);
        break;
    }

    /* Now set weights based on config if present.  (Guard against a
     * failed read: model may be NULL at this point.) */
    if (model != NULL && config != NULL) {
        float32 lw = 1.0;
        float32 wip = 1.0;
        float32 uw = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");
        if (cmd_ln_exists_r(config, "-uw"))
            uw = cmd_ln_float32_r(config, "-uw");

        ngram_model_apply_weights(model, lw, wip, uw);
    }

    return model;
}
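
A minimal usage sketch (not part of the source file): load a model with automatic type detection and no command-line configuration. The model path is hypothetical, and error handling is abbreviated.

    logmath_t *lmath = logmath_init(1.0001, 0, 0);
    ngram_model_t *lm = ngram_model_read(NULL, "en.lm.dmp", NGRAM_AUTO, lmath);
    if (lm == NULL)
        E_FATAL("Failed to read language model\n");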

int
ngram_model_write(ngram_model_t *model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO: {
        file_type = ngram_file_name_to_type(file_name);
        return ngram_model_write(model, file_name, file_type);
    }
    case NGRAM_ARPA:
        return ngram_model_arpa_write(model, file_name);
    case NGRAM_DMP:
        return ngram_model_dmp_write(model, file_name);
    case NGRAM_DMP32:
        return ngram_model_dmp32_write(model, file_name);
    }

    return -1; /* In case your compiler is really stupid. */
}

int32
ngram_model_init(ngram_model_t *base,
                 ngram_funcs_t *funcs,
                 logmath_t *lmath,
                 int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* Allocate the n-gram count array, unless this model was
     * previously initialized. */
    if (base->n_counts == NULL)
        base->n_counts = ckd_calloc(n, sizeof(*base->n_counts));
    /* Don't reset weights if the logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0; /* i.e. 1.0 */
        base->log_uw = 0;  /* i.e. 1.0 */
        base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
        base->log_uniform_weight = logmath_get_zero(lmath);
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
    }
    else
        base->word_str = ckd_calloc(n_unigram, sizeof(char *));
    /* NOTE: Word lookups are no longer case-insensitive, since we allow
     * other encodings for word strings.  Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}

ngram_model_t *
ngram_model_retain(ngram_model_t *model)
{
    ++model->refcount;
    return model;
}

void
ngram_model_flush(ngram_model_t *model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush)(model);
}

int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    /* The hash stores composite class wids; strip the
                     * class tag before indexing word_str. */
                    ckd_free(model->word_str[NGRAM_BASEWID(lmclass->nword_hash[j].wid)]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}
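
Because models are reference-counted, a caller that needs the model to outlive its creator retains it first. A sketch, with lm as obtained from ngram_model_read() above:

    ngram_model_t *mine = ngram_model_retain(lm); /* refcount is now 2 */
    ngram_model_free(lm);   /* drops to 1; the model stays alive */
    ngram_model_free(mine); /* drops to 0; the model is actually freed */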

#ifdef HAVE_ICONV
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    iconv_t ic;
    char *outbuf;
    size_t maxlen;
    int i, writable;
    hash_table_t *new_wid;

    /* FIXME: Need to do a special case thing for the GB-HEX encoding
     * used in Sphinx3 Mandarin models. */
    if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
        E_ERROR_SYSTEM("iconv_open() failed");
        return -1;
    }
    /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
     * unlike wcstombs(3).  So we have to either call it over and over
     * again until our buffer is big enough, or call it with a huge
     * buffer and then copy things back to the output.  We will use a
     * mix of these two approaches here.  We'll keep a single big
     * buffer around, and expand it as necessary.
     */
    maxlen = 0;
    for (i = 0; i < model->n_words; ++i) {
        if (strlen(model->word_str[i]) > maxlen)
            maxlen = strlen(model->word_str[i]);
    }
    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;
    /* Really should be big enough except for pathological cases. */
    maxlen = maxlen * sizeof(int) + 15;
    outbuf = ckd_calloc(maxlen, 1);
    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        ICONV_CONST char *in;
        char *out;
        size_t inleft, outleft, result;

    start_conversion:
        in = (ICONV_CONST char *)model->word_str[i];
        /* Yes, this assumes that we don't have any NUL bytes. */
        inleft = strlen(in);
        out = outbuf;
        outleft = maxlen;

        while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            out = outbuf = ckd_realloc(outbuf, maxlen);
            /* Reset the input and output pointers.  (outleft must be
             * replenished too, or the enlarged buffer is never used.) */
            in = (ICONV_CONST char *)model->word_str[i];
            inleft = strlen(in);
            outleft = maxlen;
        }

        /* Now flush a shift-out sequence, if any. */
        if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            outbuf = ckd_realloc(outbuf, maxlen);
            /* Be very evil. */
            goto start_conversion;
        }

        result = maxlen - outleft;
        /* Okay, that was hard, now let's go shopping. */
        if (writable) {
            /* Grow or shrink the output string as necessary. */
            model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
            model->word_str[i][result] = '\0';
        }
        else {
            /* It actually was not allocated previously, so do that now. */
            model->word_str[i] = ckd_calloc(result + 1, 1);
        }
        /* Copy the new thing in. */
        memcpy(model->word_str[i], outbuf, result);

        /* Now update the hash table.  We might have terrible
         * collisions if a non-reversible conversion was requested,
         * so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    ckd_free(outbuf);
    iconv_close(ic);
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;

    return 0;
}
#else /* !HAVE_ICONV */
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    return -1;
}
#endif /* !HAVE_ICONV */
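
A call to the recoding entry point might look like this, assuming iconv support was compiled in; the encoding names are whatever iconv_open() accepts on the platform:

    if (ngram_model_recode(lm, "GB2312", "UTF-8") < 0)
        E_ERROR("Recoding failed, or no iconv support\n");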

int
ngram_model_apply_weights(ngram_model_t *model,
                          float32 lw, float32 wip, float32 uw)
{
    return (*model->funcs->apply_weights)(model, lw, wip, uw);
}

float32
ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
                        int32 *out_log_uw)
{
    if (out_log_wip) *out_log_wip = model->log_wip;
    if (out_log_uw) *out_log_uw = model->log_uw;
    return model->lw;
}

int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 0) /* Yes, this is correct, because
                                * log_zero is not available to
                                * ngram_class_prob() */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    /* Multiply by the unigram in-class weight (addition in the log domain). */
    return score + class_weight;
}

int32
ngram_score(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}
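
Note the calling convention of the vararg wrapper above: the history words follow the word being scored, most recent first, and the list must be NULL-terminated. For example, scoring "joy" after the context "a whole" (a sketch; lm as above):

    int32 score = ngram_score(lm, "joy", "whole", "a", NULL);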

int32
ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
{
    int32 hist[2] = { w2, w1 };
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}
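
The fixed-arity wrappers use the same most-recent-first history order, so this trigram call is equivalent to the vararg sketch above, with word IDs obtained from ngram_wid():

    int32 n_used;
    int32 score = ngram_tg_score(lm, ngram_wid(lm, "joy"),
                                 ngram_wid(lm, "whole"),
                                 ngram_wid(lm, "a"), &n_used);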

int32
ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
              int32 n_hist, int32 *n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 0) /* As in ngram_ng_score() above:
                                * ngram_class_prob() returns 0, not
                                * log_zero, for a missing word. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score)(model, wid, history,
                                      n_hist, n_used);
    /* Multiply by the unigram in-class weight (addition in the log domain). */
    return prob + class_weight;
}

int32
ngram_prob(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_score_to_prob(ngram_model_t *base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32)(prob / base->lw);

    return prob;
}
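
Since ngram_model_apply_weights() produces score = lw * logP + log_wip (all values in the log domain), the inversion above recovers logP = (score - log_wip) / lw. With made-up numbers: lw = 9.5 and log_wip = -200 turn a raw log probability of -200 into a score of 9.5 * (-200) + (-200) = -2100, and ngram_score_to_prob() maps -2100 back to (-2100 - (-200)) / 9.5 = -200.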

int32
ngram_unknown_wid(ngram_model_t *model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>; if not found, return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}

int32
ngram_zero(ngram_model_t *model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t *model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

int32 const *
ngram_model_get_counts(ngram_model_t *model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}

int32
ngram_wid(ngram_model_t *model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t *model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}
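
Round-tripping between strings and word IDs with the two functions above (a sketch; note that ngram_wid() falls back to the <UNK> ID for out-of-vocabulary words):

    int32 wid = ngram_wid(lm, "hello");
    const char *w = ngram_word(lm, wid); /* NULL if wid is out of range */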

int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    void *dummy;
    int32 wid;

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }
    /* Check for hash collisions. */
    if (hash_table_lookup(model->wid, word, &dummy) == 0) {
        E_ERROR("Duplicate definition of word %s\n", word);
        return NGRAM_INVALID_WID;
    }
    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }
    /* Add the word string; words added here (class words included)
     * are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0) {
        if (model->writable)
            ckd_free(model->word_str[wid]);
        return -1;
    }
    return wid;
}
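
Adding an out-of-vocabulary word at run time might look like this (a sketch; the word and its unigram weight are hypothetical):

    int32 wid = ngram_model_add_word(lm, "sphinxbase", 0.5f);
    if (wid < 0)
        E_ERROR("Could not add word\n");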

ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* start_wid is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_WARN("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}

int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Stupidest possible hash function.  This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Do we have any more buckets? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so make more. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket.  We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}

void
ngram_class_free(ngram_class_t *lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* Hmm, no such class.  It's probably not a good idea to create one. */
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}

int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in the model.  If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
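
Defining a small class might look like this (a sketch with hypothetical words and weights; the tag "[NAME]" is added as a unigram with the given class weight if it does not already exist):

    const char *words[] = { "alice", "bob" };
    const float32 weights[] = { 0.5f, 0.5f };
    int32 classid = ngram_model_add_class(lm, "[NAME]", 1.0f,
                                          (char **)words, weights, 2);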

int32
ngram_class_prob(ngram_class_t *lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table, which may not exist yet. */
        if (lmclass->n_hash == 0)
            return 0;
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 0;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}

int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Reset everything. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got. */
        }
    }
    rv = 0; /* Success. */

error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
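
From the parsing loop above, a class definition file consists of one or more LMCLASS ... END blocks, one word per line, with an optional probability column that defaults to 1.0. A hypothetical example:

    LMCLASS [NAME]
    alice 0.6
    bob 0.4
    END [NAME]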

void
classdef_free(classdef_t *classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}

int32
ngram_model_read_classdef(ngram_model_t *model,
                          const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}
