src/libsphinxbase/lm/lm3g_model.c

/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file lm3g_model.c Core Sphinx 3-gram code used in
 * DMP/DMP32/ARPA (for now) model code.
 *
 * Author: A cast of thousands, probably.
 */
#include "lm3g_model.h"
#include "listelem_alloc.h"
#include "ckd_alloc.h"

#include <string.h>
#include <assert.h>

void
lm3g_tginfo_free(ngram_model_t *base, lm3g_model_t *lm3g)
{
    if (lm3g->tginfo == NULL)
        return;
    listelem_alloc_free(lm3g->le);
    ckd_free(lm3g->tginfo);
}

void
lm3g_tginfo_reset(ngram_model_t *base, lm3g_model_t *lm3g)
{
    if (lm3g->tginfo == NULL)
        return;
    listelem_alloc_free(lm3g->le);
    memset(lm3g->tginfo, 0, base->n_counts[0] * sizeof(tginfo_t *));
    lm3g->le = listelem_alloc_init(sizeof(tginfo_t));
}
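/*
 * Note (not part of the original file): lm3g->tginfo is the per-unigram
 * cache of trigram-follower lists consulted during scoring.
 * lm3g_tginfo_free() releases it for good at model teardown, while
 * lm3g_tginfo_reset() clears it and starts a fresh list-element allocator,
 * presumably so that cached entries can be discarded, e.g. when the model
 * is flushed via ngram_model_flush().
 */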

void
lm3g_apply_weights(ngram_model_t *base,
                   lm3g_model_t *lm3g,
                   float32 lw, float32 wip, float32 uw)
{
    int32 log_wip, log_uw, log_uniform_weight;
    int i;

    /* Precalculate some log values we will need. */
    log_wip = logmath_log(base->lmath, wip);
    log_uw = logmath_log(base->lmath, uw);
    log_uniform_weight = logmath_log(base->lmath, 1.0 - uw);

    for (i = 0; i < base->n_counts[0]; ++i) {
        int32 prob1, bo_wt, n_used;

        /* Backoff weights just get scaled by the lw. */
        bo_wt = (int32)(lm3g->unigrams[i].bo_wt1.l / base->lw);
        /* Unscaling unigram probs is a bit more complicated, so punt
         * it back to the general code. */
        prob1 = ngram_ng_prob(base, i, NULL, 0, &n_used);
        /* Now compute the new scaled probabilities. */
        lm3g->unigrams[i].bo_wt1.l = (int32)(bo_wt * lw);
        if (strcmp(base->word_str[i], "<s>") == 0) { /* FIXME: configurable start_sym */
            /* Apply language weight and WIP */
            lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
        }
        else {
            /* Interpolate unigram probability with uniform. */
            prob1 += log_uw;
            prob1 = logmath_add(base->lmath, prob1, base->log_uniform + log_uniform_weight);
            /* Apply language weight and WIP */
            lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
        }
    }

    for (i = 0; i < lm3g->n_prob2; ++i) {
        int32 prob2;
        /* Can't just punt this back to general code since it is quantized. */
        prob2 = (int32)((lm3g->prob2[i].l - base->log_wip) / base->lw);
        lm3g->prob2[i].l = (int32)(prob2 * lw) + log_wip;
    }

    if (base->n > 2) {
        for (i = 0; i < lm3g->n_bo_wt2; ++i) {
            lm3g->bo_wt2[i].l = (int32)(lm3g->bo_wt2[i].l / base->lw * lw);
        }
        for (i = 0; i < lm3g->n_prob3; ++i) {
            int32 prob3;
            /* Can't just punt this back to general code since it is quantized. */
            prob3 = (int32)((lm3g->prob3[i].l - base->log_wip) / base->lw);
            lm3g->prob3[i].l = (int32)(prob3 * lw) + log_wip;
        }
    }

    /* Store updated values in the model. */
    base->log_wip = log_wip;
    base->log_uw = log_uw;
    base->log_uniform_weight = log_uniform_weight;
    base->lw = lw;
}
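/*
 * Illustrative note (not part of the original file): in the logmath log
 * domain, the loop above recomputes each ordinary unigram score roughly as
 *
 *     prob1'(w) = lw * logmath_add(P(w) + log(uw),
 *                                  log_uniform + log(1 - uw)) + log(wip)
 *
 * where P(w) is the unweighted unigram log probability recovered via
 * ngram_ng_prob(), log_uniform is essentially log(1/N) for N unigrams,
 * lw is the new language weight, uw the unigram weight and wip the word
 * insertion penalty.  The start symbol <s> skips the uniform interpolation
 * and only gets lw and wip applied.  The bigram/trigram probabilities are
 * rescaled in place because they live in shared, quantized tables: the old
 * wip and lw are stripped off and the new ones applied.  Callers normally
 * reach this code through the public ngram_model_apply_weights() entry
 * point rather than invoking it directly.
 */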

int32
lm3g_add_ug(ngram_model_t *base,
            lm3g_model_t *lm3g, int32 wid, int32 lweight)
{
    int32 score;

    /* This would be very bad if this happened! */
    assert(!NGRAM_IS_CLASSWID(wid));

    /* Reallocate unigram array. */
    lm3g->unigrams = ckd_realloc(lm3g->unigrams,
                                 sizeof(*lm3g->unigrams) * base->n_1g_alloc);
    memset(lm3g->unigrams + base->n_counts[0], 0,
           (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->unigrams));
    /* Reallocate tginfo array. */
    lm3g->tginfo = ckd_realloc(lm3g->tginfo,
                               sizeof(*lm3g->tginfo) * base->n_1g_alloc);
    memset(lm3g->tginfo + base->n_counts[0], 0,
           (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->tginfo));
    /* FIXME: we really ought to update base->log_uniform *and*
     * renormalize all the other unigrams.  This is really slow, so I
     * will probably just provide a function to renormalize after
     * adding unigrams, for anyone who really cares. */
    /* This could be simplified but then we couldn't do it in logmath */
    score = lweight + base->log_uniform + base->log_uw;
    score = logmath_add(base->lmath, score,
                        base->log_uniform + base->log_uniform_weight);
    lm3g->unigrams[wid].prob1.l = score;
    /* This unigram by definition doesn't participate in any bigrams,
     * so its backoff weight and bigram pointer are both undefined. */
    lm3g->unigrams[wid].bo_wt1.l = logmath_get_zero(base->lmath);
    lm3g->unigrams[wid].bigrams = 0;
    /* Finally, increase the unigram count */
    ++base->n_counts[0];
    /* FIXME: Note that this can actually be quite bogus due to the
     * presence of class words.  If wid falls outside the unigram
     * count, increase it to compensate, at the cost of no longer
     * really knowing how many unigrams we have :( */
    if (wid >= base->n_counts[0])
        base->n_counts[0] = wid + 1;

    return score;
}
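#if 0
/*
 * Usage sketch (not part of the original file; guarded out of the build):
 * new unigrams are normally added through the public ngram_model_add_word()
 * API, which allocates the word string, assigns a word ID and then
 * dispatches down to lm3g_add_ug() for these 3-gram model types.  The word
 * "example" and the weight 1.0 below are arbitrary illustrative values.
 */
static int32
add_example_unigram(ngram_model_t *model)
{
    /* Returns the ID assigned to the newly added word. */
    return ngram_model_add_word(model, "example", 1.0f);
}
#endif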
