• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/libsphinxbase/lm/lm3g_model.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * \file lm3g_model.c Core Sphinx 3-gram code used in
00039  * DMP/DMP32/ARPA (for now) model code.
00040  *
00041  * Author: A cast of thousands, probably.
00042  */
00043 #include "lm3g_model.h"
00044 #include "listelem_alloc.h"
00045 #include "ckd_alloc.h"
00046 #include "err.h"
00047 
00048 #include <string.h>
00049 #include <assert.h>
00050 #include <limits.h>
00051 
00052 void
00053 lm3g_tginfo_free(ngram_model_t *base, lm3g_model_t *lm3g)
00054 {
00055         if (lm3g->tginfo == NULL)
00056                 return;
00057         listelem_alloc_free(lm3g->le);
00058         ckd_free(lm3g->tginfo);
00059 }
00060 
00061 void
00062 lm3g_tginfo_reset(ngram_model_t *base, lm3g_model_t *lm3g)
00063 {
00064     if (lm3g->tginfo == NULL)
00065         return;
00066     listelem_alloc_free(lm3g->le);
00067     memset(lm3g->tginfo, 0, base->n_counts[0] * sizeof(tginfo_t *));
00068     lm3g->le = listelem_alloc_init(sizeof(tginfo_t));
00069 }
00070 
00071 void
00072 lm3g_apply_weights(ngram_model_t *base,
00073                    lm3g_model_t *lm3g,
00074                    float32 lw, float32 wip, float32 uw)
00075 {
00076     int32 log_wip, log_uw, log_uniform_weight;
00077     int i;
00078 
00079     /* Precalculate some log values we will like. */
00080     log_wip = logmath_log(base->lmath, wip);
00081     log_uw = logmath_log(base->lmath, uw);
00082     log_uniform_weight = logmath_log(base->lmath, 1.0 - uw);
00083 
00084     for (i = 0; i < base->n_counts[0]; ++i) {
00085         int32 prob1, bo_wt, n_used;
00086 
00087         /* Backoff weights just get scaled by the lw. */
00088         bo_wt = (int32)(lm3g->unigrams[i].bo_wt1.l / base->lw);
00089         /* Unscaling unigram probs is a bit more complicated, so punt
00090          * it back to the general code. */
00091         prob1 = ngram_ng_prob(base, i, NULL, 0, &n_used);
00092         /* Now compute the new scaled probabilities. */
00093         lm3g->unigrams[i].bo_wt1.l = (int32)(bo_wt * lw);
00094         if (strcmp(base->word_str[i], "<s>") == 0) { /* FIXME: configurable start_sym */
00095             /* Apply language weight and WIP */
00096             lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
00097         }
00098         else {
00099             /* Interpolate unigram probability with uniform. */
00100             prob1 += log_uw;
00101             prob1 = logmath_add(base->lmath, prob1, base->log_uniform + log_uniform_weight);
00102             /* Apply language weight and WIP */
00103             lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
00104         }
00105     }
00106 
00107     for (i = 0; i < lm3g->n_prob2; ++i) {
00108         int32 prob2;
00109         /* Can't just punt this back to general code since it is quantized. */
00110         prob2 = (int32)((lm3g->prob2[i].l - base->log_wip) / base->lw);
00111         lm3g->prob2[i].l = (int32)(prob2 * lw) + log_wip;
00112     }
00113 
00114     if (base->n > 2) {
00115         for (i = 0; i < lm3g->n_bo_wt2; ++i) {
00116             lm3g->bo_wt2[i].l = (int32)(lm3g->bo_wt2[i].l  / base->lw * lw);
00117         }
00118         for (i = 0; i < lm3g->n_prob3; i++) {
00119             int32 prob3;
00120             /* Can't just punt this back to general code since it is quantized. */
00121             prob3 = (int32)((lm3g->prob3[i].l - base->log_wip) / base->lw);
00122             lm3g->prob3[i].l = (int32)(prob3 * lw) + log_wip;
00123         }
00124     }
00125 
00126     /* Store updated values in the model. */
00127     base->log_wip = log_wip;
00128     base->log_uw = log_uw;
00129     base->log_uniform_weight = log_uniform_weight;
00130     base->lw = lw;
00131 }
00132 
00133 int32
00134 lm3g_add_ug(ngram_model_t *base,
00135             lm3g_model_t *lm3g, int32 wid, int32 lweight)
00136 {
00137     int32 score;
00138 
00139     /* This would be very bad if this happened! */
00140     assert(!NGRAM_IS_CLASSWID(wid));
00141 
00142     /* Reallocate unigram array. */
00143     lm3g->unigrams = ckd_realloc(lm3g->unigrams,
00144                                  sizeof(*lm3g->unigrams) * base->n_1g_alloc);
00145     memset(lm3g->unigrams + base->n_counts[0], 0,
00146            (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->unigrams));
00147     /* Reallocate tginfo array. */
00148     lm3g->tginfo = ckd_realloc(lm3g->tginfo,
00149                                sizeof(*lm3g->tginfo) * base->n_1g_alloc);
00150     memset(lm3g->tginfo + base->n_counts[0], 0,
00151            (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->tginfo));
00152     /* FIXME: we really ought to update base->log_uniform *and*
00153      * renormalize all the other unigrams.  This is really slow, so I
00154      * will probably just provide a function to renormalize after
00155      * adding unigrams, for anyone who really cares. */
00156     /* This could be simplified but then we couldn't do it in logmath */
00157     score = lweight + base->log_uniform + base->log_uw;
00158     score = logmath_add(base->lmath, score,
00159                         base->log_uniform + base->log_uniform_weight);
00160     lm3g->unigrams[wid].prob1.l = score;
00161     /* This unigram by definition doesn't participate in any bigrams,
00162      * so its backoff weight and bigram pointer are both undefined. */
00163     lm3g->unigrams[wid].bo_wt1.l = logmath_get_zero(base->lmath);
00164     lm3g->unigrams[wid].bigrams = 0;
00165     /* Finally, increase the unigram count */
00166     ++base->n_counts[0];
00167     /* FIXME: Note that this can actually be quite bogus due to the
00168      * presence of class words.  If wid falls outside the unigram
00169      * count, increase it to compensate, at the cost of no longer
00170      * really knowing how many unigrams we have :( */
00171     if (wid >= base->n_counts[0])
00172         base->n_counts[0] = wid + 1;
00173 
00174     return score;
00175 }
00176 
00177 void
00178 init_sorted_list(sorted_list_t * l)
00179 {
00180     /* FIXME FIXME FIXME: Fixed size array!??! */
00181     l->list = ckd_calloc(MAX_SORTED_ENTRIES,
00182                          sizeof(sorted_entry_t));
00183     l->list[0].val.l = INT_MIN;
00184     l->list[0].lower = 0;
00185     l->list[0].higher = 0;
00186     l->free = 1;
00187 }
00188 
00189 void
00190 free_sorted_list(sorted_list_t * l)
00191 {
00192     free(l->list);
00193 }
00194 
00195 lmprob_t *
00196 vals_in_sorted_list(sorted_list_t * l)
00197 {
00198     lmprob_t *vals;
00199     int32 i;
00200 
00201     vals = ckd_calloc(l->free, sizeof(lmprob_t));
00202     for (i = 0; i < l->free; i++)
00203         vals[i] = l->list[i].val;
00204     return (vals);
00205 }
00206 
00207 int32
00208 sorted_id(sorted_list_t * l, int32 *val)
00209 {
00210     int32 i = 0;
00211 
00212     for (;;) {
00213         if (*val == l->list[i].val.l)
00214             return (i);
00215         if (*val < l->list[i].val.l) {
00216             if (l->list[i].lower == 0) {
00217                 if (l->free >= MAX_SORTED_ENTRIES) {
00218                     /* Make the best of a bad situation. */
00219                     E_WARN("sorted list overflow (%d => %d)\n",
00220                            *val, l->list[i].val.l);
00221                     return i;
00222                 }
00223 
00224                 l->list[i].lower = l->free;
00225                 (l->free)++;
00226                 i = l->list[i].lower;
00227                 l->list[i].val.l = *val;
00228                 return (i);
00229             }
00230             else
00231                 i = l->list[i].lower;
00232         }
00233         else {
00234             if (l->list[i].higher == 0) {
00235                 if (l->free >= MAX_SORTED_ENTRIES) {
00236                     /* Make the best of a bad situation. */
00237                     E_WARN("sorted list overflow (%d => %d)\n",
00238                            *val, l->list[i].val);
00239                     return i;
00240                 }
00241 
00242                 l->list[i].higher = l->free;
00243                 (l->free)++;
00244                 i = l->list[i].higher;
00245                 l->list[i].val.l = *val;
00246                 return (i);
00247             }
00248             else
00249                 i = l->list[i].higher;
00250         }
00251     }
00252 }
00253 

Generated on Tue Aug 17 2010 for SphinxBase by  doxygen 1.7.1