• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/libsphinxbase/lm/ngram_model_set.c

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2008 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00042 #include "ngram_model_set.h"
00043 
00044 #include <err.h>
00045 #include <ckd_alloc.h>
00046 #include <strfuncs.h>
00047 #include <filename.h>
00048 
00049 #include <string.h>
00050 #include <stdlib.h>
00051 
00052 static ngram_funcs_t ngram_model_set_funcs;
00053 
/* qsort() comparator for the merged word list: orders strings
 * lexicographically, except that "<UNK>" always sorts first (so it
 * lands at word ID 0 in the merged vocabulary). */
static int
my_compare(const void *a, const void *b)
{
    const char *sa = *(char * const *)a;
    const char *sb = *(char * const *)b;

    if (strcmp(sa, "<UNK>") == 0)
        return -1;
    if (strcmp(sb, "<UNK>") == 0)
        return 1;
    return strcmp(sa, sb);
}
00065 
/**
 * Rebuild the merged vocabulary and the per-submodel word-ID mapping.
 *
 * Unions the vocabularies of all submodels into base->word_str (sorted,
 * with <UNK> forced to index 0 by my_compare()), initializes the base
 * model for n-gram order n, and fills set->widmap such that
 * widmap[word][model] is that word's ID in the given submodel.
 *
 * Note: base->word_str entries alias the submodels' word strings
 * (base->writable is cleared), so the submodels must outlive the set.
 */
static void
build_widmap(ngram_model_t *base, logmath_t *lmath, int32 n)
{
    ngram_model_set_t *set = (ngram_model_set_t *)base;
    ngram_model_t **models = set->lms;
    hash_table_t *vocab;
    glist_t hlist;
    gnode_t *gn;
    int32 i;

    /* Construct a merged vocabulary and a set of word-ID mappings. */
    vocab = hash_table_new(models[0]->n_words, FALSE);
    /* Create the set of merged words. */
    for (i = 0; i < set->n_models; ++i) {
        int32 j;
        for (j = 0; j < models[i]->n_words; ++j) {
            /* Ignore collisions. */
            (void)hash_table_enter_int32(vocab, models[i]->word_str[j], j);
        }
    }
    /* Create the array of words, then sort it. */
    /* Ensure <UNK> is present in the merged vocabulary. */
    if (hash_table_lookup(vocab, "<UNK>", NULL) != 0)
        (void)hash_table_enter_int32(vocab, "<UNK>", 0);
    /* Now we know the number of unigrams, initialize the base model. */
    ngram_model_init(base, &ngram_model_set_funcs, lmath, n, hash_table_inuse(vocab));
    base->writable = FALSE; /* We will reuse the pointers from the submodels. */
    i = 0;
    hlist = hash_table_tolist(vocab, NULL);
    for (gn = hlist; gn; gn = gnode_next(gn)) {
        hash_entry_t *ent = gnode_ptr(gn);
        base->word_str[i++] = (char *)ent->key;
    }
    glist_free(hlist);
    /* my_compare() sorts <UNK> to the front, giving it word ID 0. */
    qsort(base->word_str, base->n_words, sizeof(*base->word_str), my_compare);

    /* Now create the word ID mappings. */
    if (set->widmap)
        ckd_free_2d((void **)set->widmap);
    /* widmap is indexed [merged word ID][submodel index]. */
    set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models,
                                           sizeof(**set->widmap));
    for (i = 0; i < base->n_words; ++i) {
        int32 j;
        /* Also create the master wid mapping. */
        (void)hash_table_enter_int32(base->wid, base->word_str[i], i);
        /* printf("%s: %d => ", base->word_str[i], i); */
        for (j = 0; j < set->n_models; ++j) {
            /* NOTE(review): presumably ngram_wid() returns the submodel's
             * unknown-word ID when the word is absent — confirm. */
            set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]);
            /* printf("%d ", set->widmap[i][j]); */
        }
        /* printf("\n"); */
    }
    hash_table_free(vocab);
}
00119 
/**
 * Create a model set from an array of submodels.
 *
 * @param config   Command-line configuration (not used in this function).
 * @param models   Array of n_models submodels; the pointers are adopted
 *                 as-is (not copied), so they must remain valid.
 * @param names    Array of n_models names; each is duplicated.
 * @param weights  Optional linear interpolation weights.  If non-NULL,
 *                 interpolation mode is enabled (cur == -1); otherwise a
 *                 uniform distribution is installed.
 * @param n_models Number of submodels; must be non-zero.
 * @return The new model set, or NULL on failure.
 *
 * All submodels must share the same logmath base and shift.
 */
ngram_model_t *
ngram_model_set_init(cmd_ln_t *config,
                     ngram_model_t **models,
                     char **names,
                     const float32 *weights,
                     int32 n_models)
{
    ngram_model_set_t *model;
    ngram_model_t *base;
    logmath_t *lmath;
    int32 i, n;

    if (n_models == 0) /* WTF */
        return NULL;

    /* Do consistency checking on the models.  They must all use the
     * same logbase and shift. */
    lmath = models[0]->lmath;
    for (i = 0; i < n_models; ++i) {
        if (logmath_get_base(models[i]->lmath) != logmath_get_base(lmath)
            || logmath_get_shift(models[i]->lmath) != logmath_get_shift(lmath)) {
            E_ERROR("Log-math parameters don't match, will not create LM set\n");
            return NULL;
        }
    }

    /* Allocate the combined model, initialize it. */
    model = ckd_calloc(1, sizeof(*model));
    base = &model->base;
    model->n_models = n_models;
    model->lms = ckd_calloc(n_models, sizeof(*model->lms));
    model->names = ckd_calloc(n_models, sizeof(*model->names));
    /* Initialize weights to a uniform distribution */
    model->lweights = ckd_calloc(n_models, sizeof(*model->lweights));
    {
        int32 uniform = logmath_log(lmath, 1.0/n_models);
        for (i = 0; i < n_models; ++i)
            model->lweights[i] = uniform;
    }
    /* Default to interpolate if weights were given. */
    /* cur == -1 means "interpolate all submodels". */
    if (weights)
        model->cur = -1;

    n = 0;
    for (i = 0; i < n_models; ++i) {
        model->lms[i] = models[i];
        model->names[i] = ckd_salloc(names[i]);
        if (weights)
            model->lweights[i] = logmath_log(lmath, weights[i]);
        /* N is the maximum of all merged models. */
        if (models[i]->n > n)
            n = models[i]->n;
    }
    /* Allocate the history mapping table. */
    /* n-1 slots: the longest history an order-n model can use. */
    model->maphist = ckd_calloc(n - 1, sizeof(*model->maphist));

    /* Now build the word-ID mapping and merged vocabulary. */
    build_widmap(base, lmath, n);
    return base;
}
00180 
00181 ngram_model_t *
00182 ngram_model_set_read(cmd_ln_t *config,
00183                      const char *lmctlfile,
00184                      logmath_t *lmath)
00185 {
00186     FILE *ctlfp;
00187     glist_t lms = NULL;
00188     glist_t lmnames = NULL;
00189     __BIGSTACKVARIABLE__ char str[1024];
00190     ngram_model_t *set = NULL;
00191     hash_table_t *classes;
00192     char *basedir, *c;
00193 
00194     /* Read all the class definition files to accumulate a mapping of
00195      * classnames to definitions. */
00196     classes = hash_table_new(0, FALSE);
00197     if ((ctlfp = fopen(lmctlfile, "r")) == NULL) {
00198         E_ERROR_SYSTEM("Failed to open %s", lmctlfile);
00199         return NULL;
00200     }
00201 
00202     /* Try to find the base directory to append to relative paths in
00203      * the lmctl file. */
00204     if ((c = strrchr(lmctlfile, '/')) || (c = strrchr(lmctlfile, '\\'))) {
00205         /* Include the trailing slash. */
00206         basedir = ckd_calloc(c - lmctlfile + 2, 1);
00207         memcpy(basedir, lmctlfile, c - lmctlfile + 1);
00208     }
00209     else {
00210         basedir = NULL;
00211     }
00212     E_INFO("Reading LM control file '%s'\n", lmctlfile);
00213     if (basedir)
00214         E_INFO("Will prepend '%s' to unqualified paths\n", basedir);
00215 
00216     if (fscanf(ctlfp, "%1023s", str) == 1) {
00217         if (strcmp(str, "{") == 0) {
00218             /* Load LMclass files */
00219             while ((fscanf(ctlfp, "%1023s", str) == 1)
00220                    && (strcmp(str, "}") != 0)) {
00221                 char *deffile;
00222                 if (basedir && !path_is_absolute(str))
00223                     deffile = string_join(basedir, str, NULL);
00224                 else
00225                     deffile = ckd_salloc(str);
00226                 E_INFO("Reading classdef from '%s'\n", deffile);
00227                 if (read_classdef_file(classes, deffile) < 0) {
00228                     ckd_free(deffile);
00229                     goto error_out;
00230                 }
00231                 ckd_free(deffile);
00232             }
00233 
00234             if (strcmp(str, "}") != 0) {
00235                 E_ERROR("Unexpected EOF in %s\n", lmctlfile);
00236                 goto error_out;
00237             }
00238 
00239             /* This might be the first LM name. */
00240             if (fscanf(ctlfp, "%1023s", str) != 1)
00241                 str[0] = '\0';
00242         }
00243     }
00244     else
00245         str[0] = '\0';
00246 
00247     /* Read in one LM at a time and add classes to them as necessary. */
00248     while (str[0] != '\0') {
00249         char *lmfile;
00250         ngram_model_t *lm;
00251 
00252         if (basedir && str[0] != '/' && str[0] != '\\')
00253             lmfile = string_join(basedir, str, NULL);
00254         else
00255             lmfile = ckd_salloc(str);
00256         E_INFO("Reading lm from '%s'\n", lmfile);
00257         lm = ngram_model_read(config, lmfile, NGRAM_AUTO, lmath);
00258         if (lm == NULL) {
00259             ckd_free(lmfile);
00260             goto error_out;
00261         }
00262         if (fscanf(ctlfp, "%1023s", str) != 1) {
00263             E_ERROR("LMname missing after LMFileName '%s'\n", lmfile);
00264             ckd_free(lmfile);
00265             goto error_out;
00266         }
00267         ckd_free(lmfile);
00268         lms = glist_add_ptr(lms, lm);
00269         lmnames = glist_add_ptr(lmnames, ckd_salloc(str));
00270 
00271         if (fscanf(ctlfp, "%1023s", str) == 1) {
00272             if (strcmp(str, "{") == 0) {
00273                 /* LM uses classes; read their names */
00274                 while ((fscanf(ctlfp, "%1023s", str) == 1) &&
00275                        (strcmp(str, "}") != 0)) {
00276                     void *val;
00277                     classdef_t *classdef;
00278 
00279                     E_INFO("Adding class '%s'\n", str);
00280                     if (hash_table_lookup(classes, str, &val) == -1) {
00281                         E_ERROR("Unknown class %s in control file\n", str);
00282                         goto error_out;
00283                     }
00284                     classdef = val;
00285                     if (ngram_model_add_class(lm, str, 1.0,
00286                                               classdef->words, classdef->weights,
00287                                               classdef->n_words) < 0) {
00288                         goto error_out;
00289                     }
00290                 }
00291                 if (strcmp(str, "}") != 0) {
00292                     E_ERROR("Unexpected EOF in %s\n", lmctlfile);
00293                     goto error_out;
00294                 }
00295                 if (fscanf(ctlfp, "%1023s", str) != 1)
00296                     str[0] = '\0';
00297             }
00298         }
00299         else
00300             str[0] = '\0';
00301     }
00302     fclose(ctlfp);
00303 
00304     /* Now construct arrays out of lms and lmnames, and build an
00305      * ngram_model_set. */
00306     lms = glist_reverse(lms);
00307     lmnames = glist_reverse(lmnames);
00308     {
00309         int32 n_models;
00310         ngram_model_t **lm_array;
00311         char **name_array;
00312         gnode_t *lm_node, *name_node;
00313         int32 i;
00314 
00315         n_models = glist_count(lms);
00316         lm_array = ckd_calloc(n_models, sizeof(*lm_array));
00317         name_array = ckd_calloc(n_models, sizeof(*name_array));
00318         lm_node = lms;
00319         name_node = lmnames;
00320         for (i = 0; i < n_models; ++i) {
00321             lm_array[i] = gnode_ptr(lm_node);
00322             name_array[i] = gnode_ptr(name_node);
00323             lm_node = gnode_next(lm_node);
00324             name_node = gnode_next(name_node);
00325         }
00326         set = ngram_model_set_init(config, lm_array, name_array,
00327                                    NULL, n_models);
00328         ckd_free(lm_array);
00329         ckd_free(name_array);
00330     }
00331 error_out:
00332     {
00333         gnode_t *gn;
00334         glist_t hlist;
00335 
00336         if (set == NULL) {
00337             for (gn = lms; gn; gn = gnode_next(gn)) {
00338                 ngram_model_free(gnode_ptr(gn));
00339             }
00340         }
00341         glist_free(lms);
00342         for (gn = lmnames; gn; gn = gnode_next(gn)) {
00343             ckd_free(gnode_ptr(gn));
00344         }
00345         glist_free(lmnames);
00346         hlist = hash_table_tolist(classes, NULL);
00347         for (gn = hlist; gn; gn = gnode_next(gn)) {
00348             hash_entry_t *he = gnode_ptr(gn);
00349             ckd_free((char *)he->key);
00350             classdef_free(he->val);
00351         }
00352         glist_free(hlist);
00353         hash_table_free(classes);
00354         ckd_free(basedir);
00355     }
00356     return set;
00357 }
00358 
00359 int32
00360 ngram_model_set_count(ngram_model_t *base)
00361 {
00362     ngram_model_set_t *set = (ngram_model_set_t *)base;
00363     return set->n_models;
00364 }
00365 
00366 ngram_model_set_iter_t *
00367 ngram_model_set_iter(ngram_model_t *base)
00368 {
00369     ngram_model_set_t *set = (ngram_model_set_t *)base;
00370     ngram_model_set_iter_t *itor;
00371 
00372     if (set == NULL || set->n_models == 0)
00373         return NULL;
00374     itor = ckd_calloc(1, sizeof(*itor));
00375     itor->set = set;
00376     return itor;
00377 }
00378 
00379 ngram_model_set_iter_t *
00380 ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
00381 {
00382     if (++itor->cur == itor->set->n_models) {
00383         ngram_model_set_iter_free(itor);
00384         return NULL;
00385     }
00386     return itor;
00387 }
00388 
00389 void
00390 ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
00391 {
00392     ckd_free(itor);
00393 }
00394 
00395 ngram_model_t *
00396 ngram_model_set_iter_model(ngram_model_set_iter_t *itor,
00397                            char const **lmname)
00398 {
00399     if (lmname) *lmname = itor->set->names[itor->cur];
00400     return itor->set->lms[itor->cur];
00401 }
00402 
00403 ngram_model_t *
00404 ngram_model_set_lookup(ngram_model_t *base,
00405                        const char *name)
00406 {
00407     ngram_model_set_t *set = (ngram_model_set_t *)base;
00408     int32 i;
00409 
00410     if (name == NULL) {
00411         if (set->cur == -1)
00412             return NULL;
00413         else
00414             return set->lms[set->cur];
00415     }
00416 
00417     /* There probably won't be very many submodels. */
00418     for (i = 0; i < set->n_models; ++i)
00419         if (0 == strcmp(set->names[i], name))
00420             break;
00421     if (i == set->n_models)
00422         return NULL;
00423     return set->lms[i];
00424 }
00425 
00426 ngram_model_t *
00427 ngram_model_set_select(ngram_model_t *base,
00428                        const char *name)
00429 {
00430     ngram_model_set_t *set = (ngram_model_set_t *)base;
00431     int32 i;
00432 
00433     /* There probably won't be very many submodels. */
00434     for (i = 0; i < set->n_models; ++i)
00435         if (0 == strcmp(set->names[i], name))
00436             break;
00437     if (i == set->n_models)
00438         return NULL;
00439     set->cur = i;
00440     return set->lms[set->cur];
00441 }
00442 
00443 const char *
00444 ngram_model_set_current(ngram_model_t *base)
00445 {
00446     ngram_model_set_t *set = (ngram_model_set_t *)base;
00447 
00448     if (set->cur == -1)
00449         return NULL;
00450     else
00451         return set->names[set->cur];
00452 }
00453 
00454 int32
00455 ngram_model_set_current_wid(ngram_model_t *base,
00456                             int32 set_wid)
00457 {
00458     ngram_model_set_t *set = (ngram_model_set_t *)base;
00459 
00460     if (set->cur == -1 || set_wid >= base->n_words)
00461         return NGRAM_INVALID_WID;
00462     else
00463         return set->widmap[set->cur][set_wid];
00464 }
00465 
00466 int32
00467 ngram_model_set_known_wid(ngram_model_t *base,
00468                           int32 set_wid)
00469 {
00470     ngram_model_set_t *set = (ngram_model_set_t *)base;
00471 
00472     if (set_wid >= base->n_words)
00473         return FALSE;
00474     else if (set->cur == -1) {
00475         int32 i;
00476         for (i = 0; i < set->n_models; ++i) {
00477             if (set->widmap[i][set_wid] != ngram_unknown_wid(set->lms[i]))
00478                 return TRUE;
00479         }
00480         return FALSE;
00481     }
00482     else
00483         return (set->widmap[set_wid][set->cur]
00484                 != ngram_unknown_wid(set->lms[set->cur]));
00485 }
00486 
00487 ngram_model_t *
00488 ngram_model_set_interp(ngram_model_t *base,
00489                        const char **names,
00490                        const float32 *weights)
00491 {
00492     ngram_model_set_t *set = (ngram_model_set_t *)base;
00493 
00494     /* If we have a set of weights here, then set them. */
00495     if (names && weights) {
00496         int32 i, j;
00497 
00498         /* We hope there aren't many models. */
00499         for (i = 0; i < set->n_models; ++i) {
00500             for (j = 0; j < set->n_models; ++j)
00501                 if (0 == strcmp(names[i], set->names[j]))
00502                     break;
00503             if (j == set->n_models) {
00504                 E_ERROR("Unknown LM name %s\n", names[i]);
00505                 return NULL;
00506             }
00507             set->lweights[j] = logmath_log(base->lmath, weights[i]);
00508         }
00509     }
00510     else if (weights) {
00511         memcpy(set->lweights, weights, set->n_models * sizeof(*set->lweights));
00512     }
00513     /* Otherwise just enable existing weights. */
00514     set->cur = -1;
00515     return base;
00516 }
00517 
/**
 * Add a submodel to the set.
 *
 * @param base         The model set.
 * @param model        Submodel to add; the pointer is adopted as-is.
 * @param name         Name for the submodel; duplicated.
 * @param weight       Relative interpolation weight for the new model.
 * @param reuse_widmap If true, extend the existing word-ID mapping with
 *                     one new column instead of rebuilding the merged
 *                     vocabulary (i.e. the new model's extra words, if
 *                     any, are not added to the set vocabulary).
 * @return the added model.
 */
ngram_model_t *
ngram_model_set_add(ngram_model_t *base,
                    ngram_model_t *model,
                    const char *name,
                    float32 weight,
                    int reuse_widmap)
                    
{
    ngram_model_set_t *set = (ngram_model_set_t *)base;
    float32 fprob;
    int32 scale, i;

    /* Add it to the array of lms. */
    ++set->n_models;
    set->lms = ckd_realloc(set->lms, set->n_models * sizeof(*set->lms));
    set->lms[set->n_models - 1] = model;
    set->names = ckd_realloc(set->names, set->n_models * sizeof(*set->names));
    set->names[set->n_models - 1] = ckd_salloc(name);
    /* Expand the history mapping table if necessary. */
    if (model->n > base->n) {
        base->n = model->n;
        set->maphist = ckd_realloc(set->maphist,
                                   (model->n - 1) * sizeof(*set->maphist));
    }

    /* Renormalize the interpolation weights. */
    /* The new model gets weight/n_models of the probability mass. */
    fprob = weight * 1.0f / set->n_models;
    set->lweights = ckd_realloc(set->lweights,
                                set->n_models * sizeof(*set->lweights));
    set->lweights[set->n_models - 1] = logmath_log(base->lmath, fprob);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(base->lmath, 1.0 - fprob);
    for (i = 0; i < set->n_models - 1; ++i)
        set->lweights[i] += scale;

    /* Reuse the old word ID mapping if requested. */
    if (reuse_widmap) {
        int32 **new_widmap;

        /* Tack another column onto the widmap array. */
        new_widmap = (int32 **)ckd_calloc_2d(base->n_words, set->n_models,
                                             sizeof (**new_widmap));
        for (i = 0; i < base->n_words; ++i) {
            /* Copy all the existing mappings. */
            memcpy(new_widmap[i], set->widmap[i],
                   (set->n_models - 1) * sizeof(**new_widmap));
            /* Create the new mapping. */
            new_widmap[i][set->n_models-1] = ngram_wid(model, base->word_str[i]);
        }
        ckd_free_2d((void **)set->widmap);
        set->widmap = new_widmap;
    }
    else {
        /* Rebuild the merged vocabulary from scratch, picking up any
         * words unique to the new model. */
        build_widmap(base, base->lmath, base->n);
    }
    return model;
}
00577 
/**
 * Remove a submodel from the set by name.
 *
 * @param base         The model set.
 * @param name         Name of the submodel to remove.
 * @param reuse_widmap If true, shrink the existing word-ID mapping rows
 *                     in place instead of rebuilding the merged
 *                     vocabulary (words unique to the removed model
 *                     remain in the set vocabulary).
 * @return the removed submodel (ownership passes to the caller), or
 *         NULL if no submodel has that name.
 */
ngram_model_t *
ngram_model_set_remove(ngram_model_t *base,
                       const char *name,
                       int reuse_widmap)
{
    ngram_model_set_t *set = (ngram_model_set_t *)base;
    ngram_model_t *submodel;
    int32 lmidx, scale, n, i;
    float32 fprob;

    for (lmidx = 0; lmidx < set->n_models; ++lmidx)
        if (0 == strcmp(name, set->names[lmidx]))
            break;
    if (lmidx == set->n_models)
        return NULL;
    submodel = set->lms[lmidx];

    /* Renormalize the interpolation weights by scaling them by
     * 1/(1-fprob) */
    fprob = (float32)logmath_exp(base->lmath, set->lweights[lmidx]);
    scale = logmath_log(base->lmath, 1.0 - fprob);

    /* Remove it from the array of lms, renormalize remaining weights,
     * and recalcluate n. */
    --set->n_models;
    n = 0;
    ckd_free(set->names[lmidx]);
    set->names[lmidx] = NULL;
    /* Shift entries after lmidx down by one while rescaling weights and
     * tracking the maximum remaining n-gram order. */
    for (i = 0; i < set->n_models; ++i) {
        if (i >= lmidx) {
            set->lms[i] = set->lms[i+1];
            set->names[i] = set->names[i+1];
            set->lweights[i] = set->lweights[i+1];
        }
        /* Subtracting log(1-fprob) divides the weight by (1-fprob). */
        set->lweights[i] -= scale;
        if (set->lms[i]->n > n)
            n = set->lms[i]->n;
    }
    /* There's no need to shrink these arrays. */
    set->lms[set->n_models] = NULL;
    set->lweights[set->n_models] = base->log_zero;
    /* No need to shrink maphist either. */

    /* Reuse the existing word ID mapping if requested. */
    if (reuse_widmap) {
        /* Just go through and shrink each row. */
        /* n_models has been decremented, so (n_models - lmidx) is the
         * count of entries after the removed column. */
        for (i = 0; i < base->n_words; ++i) {
            memmove(set->widmap[i] + lmidx, set->widmap[i] + lmidx + 1,
                    (set->n_models - lmidx) * sizeof(**set->widmap));
        }
    }
    else {
        build_widmap(base, base->lmath, n);
    }
    return submodel;
}
00634 
/**
 * Replace the set's vocabulary with an explicit word list.
 *
 * Discards the current merged vocabulary and word-ID mapping, then
 * rebuilds both from the given words.  Unlike build_widmap(), the word
 * strings are copied (base->writable is set), so the caller's array
 * need not outlive the set.
 *
 * @param words   Array of n_words word strings; duplicated.
 * @param n_words New vocabulary size.
 */
void
ngram_model_set_map_words(ngram_model_t *base,
                          const char **words,
                          int32 n_words)
{
    ngram_model_set_t *set = (ngram_model_set_t *)base;
    int32 i;

    /* Recreate the word mapping. */
    /* Only free the strings if we own them; build_widmap() leaves
     * base->writable FALSE because it aliases submodel strings. */
    if (base->writable) {
        for (i = 0; i < base->n_words; ++i) {
            ckd_free(base->word_str[i]);
        }
    }
    ckd_free(base->word_str);
    ckd_free_2d((void **)set->widmap);
    base->writable = TRUE;
    base->n_words = base->n_1g_alloc = n_words;
    base->word_str = ckd_calloc(n_words, sizeof(*base->word_str));
    set->widmap = (int32 **)ckd_calloc_2d(n_words, set->n_models, sizeof(**set->widmap));
    hash_table_empty(base->wid);
    for (i = 0; i < n_words; ++i) {
        int32 j;
        base->word_str[i] = ckd_salloc(words[i]);
        (void)hash_table_enter_int32(base->wid, base->word_str[i], i);
        /* widmap[word][model], as in build_widmap(). */
        for (j = 0; j < set->n_models; ++j) {
            set->widmap[i][j] = ngram_wid(set->lms[j], base->word_str[i]);
        }
    }
}
00665 
00666 static int
00667 ngram_model_set_apply_weights(ngram_model_t *base, float32 lw,
00668                               float32 wip, float32 uw)
00669 {
00670     ngram_model_set_t *set = (ngram_model_set_t *)base;
00671     int32 i;
00672 
00673     /* Apply weights to each sub-model. */
00674     for (i = 0; i < set->n_models; ++i)
00675         ngram_model_apply_weights(set->lms[i], lw, wip, uw);
00676     return 0;
00677 }
00678 
/**
 * Score a word given its history, in set-level word IDs.
 *
 * In interpolation mode (cur == -1) this computes the weighted log-sum
 * of each submodel's score; otherwise it delegates to the currently
 * selected submodel.  Word and history IDs are translated into each
 * submodel's ID space through widmap (indexed [word][model]).
 */
static int32
ngram_model_set_score(ngram_model_t *base, int32 wid,
                      int32 *history, int32 n_hist,
                      int32 *n_used)
{
    ngram_model_set_t *set = (ngram_model_set_t *)base;
    int32 mapwid;
    int32 score;
    int32 i;

    /* Truncate the history. */
    /* An order-n model can use at most n-1 words of history (also the
     * capacity of set->maphist). */
    if (n_hist > base->n - 1)
        n_hist = base->n - 1;

    /* Interpolate if there is no current. */
    if (set->cur == -1) {
        score = base->log_zero;
        for (i = 0; i < set->n_models; ++i) {
            int32 j;
            /* Map word and history IDs for each model. */
            mapwid = set->widmap[wid][i];
            for (j = 0; j < n_hist; ++j) {
                if (history[j] == NGRAM_INVALID_WID)
                    set->maphist[j] = NGRAM_INVALID_WID;
                else
                    set->maphist[j] = set->widmap[history[j]][i];
            }
            /* Accumulate lweight[i] * P_i(w|h) in log domain. */
            score = logmath_add(base->lmath, score,
                                set->lweights[i] + 
                                ngram_ng_score(set->lms[i],
                                               mapwid, set->maphist, n_hist, n_used));
        }
    }
    else {
        int32 j;
        /* Map word and history IDs (FIXME: do this in a function?) */
        mapwid = set->widmap[wid][set->cur];
        for (j = 0; j < n_hist; ++j) {
            if (history[j] == NGRAM_INVALID_WID)
                set->maphist[j] = NGRAM_INVALID_WID;
            else
                set->maphist[j] = set->widmap[history[j]][set->cur];
        }
        score = ngram_ng_score(set->lms[set->cur],
                               mapwid, set->maphist, n_hist, n_used);
    }

    return score;
}
00728 
/**
 * Raw (unweighted) probability counterpart of ngram_model_set_score():
 * identical structure, but uses ngram_ng_prob() — probabilities without
 * language weight / insertion penalty applied — on each submodel.
 */
static int32
ngram_model_set_raw_score(ngram_model_t *base, int32 wid,
                          int32 *history, int32 n_hist,
                          int32 *n_used)
{
    ngram_model_set_t *set = (ngram_model_set_t *)base;
    int32 mapwid;
    int32 score;
    int32 i;

    /* Truncate the history. */
    /* An order-n model can use at most n-1 words of history. */
    if (n_hist > base->n - 1)
        n_hist = base->n - 1;

    /* Interpolate if there is no current. */
    if (set->cur == -1) {
        score = base->log_zero;
        for (i = 0; i < set->n_models; ++i) {
            int32 j;
            /* Map word and history IDs for each model. */
            mapwid = set->widmap[wid][i];
            for (j = 0; j < n_hist; ++j) {
                if (history[j] == NGRAM_INVALID_WID)
                    set->maphist[j] = NGRAM_INVALID_WID;
                else
                    set->maphist[j] = set->widmap[history[j]][i];
            }
            /* Accumulate lweight[i] * P_i(w|h) in log domain. */
            score = logmath_add(base->lmath, score,
                                set->lweights[i] + 
                                ngram_ng_prob(set->lms[i],
                                              mapwid, set->maphist, n_hist, n_used));
        }
    }
    else {
        int32 j;
        /* Map word and history IDs (FIXME: do this in a function?) */
        mapwid = set->widmap[wid][set->cur];
        for (j = 0; j < n_hist; ++j) {
            if (history[j] == NGRAM_INVALID_WID)
                set->maphist[j] = NGRAM_INVALID_WID;
            else
                set->maphist[j] = set->widmap[history[j]][set->cur];
        }
        score = ngram_ng_prob(set->lms[set->cur],
                              mapwid, set->maphist, n_hist, n_used);
    }

    return score;
}
00778 
/**
 * Propagate a newly added unigram to the active submodel(s).
 *
 * Called after the word has been added to the master model under
 * set-level ID wid.  Adds the word to each active submodel (all of them
 * in interpolation mode, otherwise just the current one), extends
 * widmap with a row for the new word, and returns the word's (possibly
 * interpolated) log-probability, or base->log_zero on failure.
 *
 * NOTE(review): assumes base->n_words already counts the new word —
 * i.e. the caller incremented it before invoking this; confirm against
 * the generic add-word path.
 */
static int32
ngram_model_set_add_ug(ngram_model_t *base,
                       int32 wid, int32 lweight)
{
    ngram_model_set_t *set = (ngram_model_set_t *)base;
    int32 *newwid;
    int32 i, prob;

    /* At this point the word has already been added to the master
       model and we have a new word ID for it.  Add it to active
       submodels and track the word IDs. */
    newwid = ckd_calloc(set->n_models, sizeof(*newwid));
    prob = base->log_zero;
    for (i = 0; i < set->n_models; ++i) {
        int32 wprob, n_hist;

        /* Only add to active models. */
        if (set->cur == -1 || set->cur == i) {
            /* Did this word already exist? */
            newwid[i] = ngram_wid(set->lms[i], base->word_str[wid]);
            if (newwid[i] == NGRAM_INVALID_WID) {
                /* Add it to the submodel. */
                newwid[i] = ngram_model_add_word(set->lms[i], base->word_str[wid],
                                                 (float32)logmath_exp(base->lmath, lweight));
                if (newwid[i] == NGRAM_INVALID_WID) {
                    /* NOTE(review): widmap is left without a row for the
                     * new word on this failure path. */
                    ckd_free(newwid);
                    return base->log_zero;
                }
            }
            /* Now get the unigram probability for the new word and either
             * interpolate it or use it (if this is the current model). */
            wprob = ngram_ng_prob(set->lms[i], newwid[i], NULL, 0, &n_hist);
            if (set->cur == i)
                prob = wprob;
            else if (set->cur == -1)
                prob = logmath_add(base->lmath, prob, set->lweights[i] + wprob);
        }
        else {
            newwid[i] = NGRAM_INVALID_WID;
        }
    }
    /* Okay we have the word IDs for this in all the submodels.  Now
       do some complicated memory mangling to add this to the
       widmap. */
    /* widmap was built with ckd_calloc_2d, so row 0 holds the whole
     * backing store: grow it, then re-point every row into it. */
    set->widmap = ckd_realloc(set->widmap, base->n_words * sizeof(*set->widmap));
    set->widmap[0] = ckd_realloc(set->widmap[0],
                                 base->n_words
                                 * set->n_models
                                 * sizeof(**set->widmap));
    for (i = 0; i < base->n_words; ++i)
        set->widmap[i] = set->widmap[0] + i * set->n_models;
    memcpy(set->widmap[wid], newwid, set->n_models * sizeof(*newwid));
    ckd_free(newwid);
    return prob;
}
00834 
00835 static void
00836 ngram_model_set_free(ngram_model_t *base)
00837 {
00838     ngram_model_set_t *set = (ngram_model_set_t *)base;
00839     int32 i;
00840 
00841     for (i = 0; i < set->n_models; ++i)
00842         ngram_model_free(set->lms[i]);
00843     ckd_free(set->lms);
00844     for (i = 0; i < set->n_models; ++i)
00845         ckd_free(set->names[i]);
00846     ckd_free(set->names);
00847     ckd_free(set->lweights);
00848     ckd_free(set->maphist);
00849     ckd_free_2d((void **)set->widmap);
00850 }
00851 
00852 static void
00853 ngram_model_set_flush(ngram_model_t *base)
00854 {
00855     ngram_model_set_t *set = (ngram_model_set_t *)base;
00856     int32 i;
00857 
00858     for (i = 0; i < set->n_models; ++i)
00859         ngram_model_flush(set->lms[i]);
00860 }
00861 
/* Function table plugging the set implementation into the generic
 * ngram_model_t dispatch interface. */
static ngram_funcs_t ngram_model_set_funcs = {
    ngram_model_set_free,          /* free */
    ngram_model_set_apply_weights, /* apply_weights */
    ngram_model_set_score,         /* score */
    ngram_model_set_raw_score,     /* raw_score */
    ngram_model_set_add_ug,        /* add_ug */
    ngram_model_set_flush          /* flush */
};

Generated on Fri Jan 14 2011 for SphinxBase by  doxygen 1.7.1