• Main Page
  • Related Pages
  • Data Structures
  • Files
  • File List
  • Globals

src/libsphinxbase/lm/ngram_model_dmp.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * \file ngram_model_dmp.c DMP format language models
00039  *
00040  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
00041  */
00042 
00043 #include "ckd_alloc.h"
00044 #include "ngram_model_dmp.h"
00045 #include "pio.h"
00046 #include "err.h"
00047 #include "byteorder.h"
00048 #include "listelem_alloc.h"
00049 
00050 #include <stdio.h>
00051 #include <string.h>
00052 #include <stdlib.h>
00053 #include <limits.h>
00054 
00055 static const char darpa_hdr[] = "Darpa Trigram LM";
00056 static ngram_funcs_t ngram_model_dmp_funcs;
00057 
00058 #define TSEG_BASE(m,b)          ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
00059 #define FIRST_BG(m,u)           ((m)->lm3g.unigrams[u].bigrams)
00060 #define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
00061 
00062 static unigram_t *
00063 new_unigram_table(int32 n_ug)
00064 {
00065     unigram_t *table;
00066     int32 i;
00067 
00068     table = ckd_calloc(n_ug, sizeof(unigram_t));
00069     for (i = 0; i < n_ug; i++) {
00070         table[i].prob1.f = -99.0;
00071         table[i].bo_wt1.f = -99.0;
00072     }
00073     return table;
00074 }
00075 
00076 ngram_model_t *
00077 ngram_model_dmp_read(cmd_ln_t *config,
00078                      const char *file_name,
00079                      logmath_t *lmath)
00080 {
00081     ngram_model_t *base;
00082     ngram_model_dmp_t *model;
00083     FILE *fp;
00084     int do_mmap, do_swap;
00085     int32 is_pipe;
00086     int32 i, j, k, vn, n, ts;
00087     int32 n_unigram;
00088     int32 n_bigram;
00089     int32 n_trigram;
00090     char str[1024];
00091     unigram_t *ugptr;
00092     bigram_t *bgptr;
00093     trigram_t *tgptr;
00094     char *tmp_word_str;
00095     char *map_base = NULL;
00096     size_t offset = 0, filesize;
00097 
00098     do_mmap = FALSE;
00099     if (config)
00100         do_mmap = cmd_ln_boolean_r(config, "-mmap");
00101 
00102     if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
00103         E_ERROR("Dump file %s not found\n", file_name);
00104         return NULL;
00105     }
00106 
00107     if (is_pipe && do_mmap) {
00108         E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
00109         do_mmap = 0;
00110     }
00111 
00112     do_swap = FALSE;
00113     fread(&k, sizeof(k), 1, fp);
00114     if (k != strlen(darpa_hdr)+1) {
00115         SWAP_INT32(&k);
00116         if (k != strlen(darpa_hdr)+1) {
00117             E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
00118             fclose(fp);
00119             return NULL;
00120         }
00121         do_swap = 1;
00122     }
00123     if (fread(str, sizeof(char), k, fp) != (size_t) k) {
00124         E_ERROR("Cannot read header\n");
00125         fclose_comp(fp, is_pipe);
00126         return NULL;
00127     }
00128     if (strncmp(str, darpa_hdr, k) != 0) {
00129         E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
00130         fclose(fp);
00131         return NULL;
00132     }
00133 
00134     if (do_mmap) {
00135         if (do_swap) {
00136             E_INFO
00137                 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
00138             do_mmap = 0;
00139         }
00140         else {
00141             E_INFO("Will use memory-mapped I/O for LM file\n");
00142 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
00143             E_FATAL("memory mapping is not supported at the moment.");
00144 #else
00145 #endif
00146         }
00147     }
00148 
00149     fread(&k, sizeof(k), 1, fp);
00150     if (do_swap) SWAP_INT32(&k);
00151     if (fread(str, sizeof(char), k, fp) != (size_t) k) {
00152         E_ERROR("Cannot read LM filename in header\n");
00153         fclose(fp);
00154         return NULL;
00155     }
00156 
00157     /* read version#, if present (must be <= 0) */
00158     fread(&vn, sizeof(vn), 1, fp);
00159     if (do_swap) SWAP_INT32(&vn);
00160     if (vn <= 0) {
00161         /* read and don't compare timestamps (we don't care) */
00162         fread(&ts, sizeof(ts), 1, fp);
00163         if (do_swap) SWAP_INT32(&ts);
00164 
00165         /* read and skip format description */
00166         for (;;) {
00167             fread(&k, sizeof(k), 1, fp);
00168             if (do_swap) SWAP_INT32(&k);
00169             if (k == 0)
00170                 break;
00171             if (fread(str, sizeof(char), k, fp) != (size_t) k) {
00172                 E_ERROR("fread(word) failed\n");
00173                 fclose(fp);
00174                 return NULL;
00175             }
00176         }
00177         /* read model->ucount */
00178         fread(&n_unigram, sizeof(n_unigram), 1, fp);
00179         if (do_swap) SWAP_INT32(&n_unigram);
00180     }
00181     else {
00182         n_unigram = vn;
00183     }
00184 
00185     /* read model->bcount, tcount */
00186     fread(&n_bigram, sizeof(n_bigram), 1, fp);
00187     if (do_swap) SWAP_INT32(&n_bigram);
00188     fread(&n_trigram, sizeof(n_trigram), 1, fp);
00189     if (do_swap) SWAP_INT32(&n_trigram);
00190     E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
00191 
00192     /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
00193     model = ckd_calloc(1, sizeof(*model));
00194     base = &model->base;
00195     if (n_trigram > 0)
00196         n = 3;
00197     else if (n_bigram > 0)
00198         n = 2;
00199     else
00200         n = 1;
00201     ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
00202     base->n_counts[0] = n_unigram;
00203     base->n_counts[1] = n_bigram;
00204     base->n_counts[2] = n_trigram;
00205 
00206     /* read unigrams (always in memory, as they contain dictionary
00207      * mappings that can't be precomputed, and also could have OOVs added) */
00208     model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
00209     ugptr = model->lm3g.unigrams;
00210     for (i = 0; i <= n_unigram; ++i) {
00211         /* Skip over the mapping ID, we don't care about it. */
00212         if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
00213             E_ERROR("fread(mapid[%d]) failed\n", i);
00214             ngram_model_free(base);
00215             fclose_comp(fp, is_pipe);
00216             return NULL;
00217         }
00218         /* Read the actual unigram structure. */
00219         if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1)  {
00220             E_ERROR("fread(unigrams) failed\n");
00221             ngram_model_free(base);
00222             fclose_comp(fp, is_pipe);
00223             return NULL;
00224         }
00225         /* Byte swap if necessary. */
00226         if (do_swap) {
00227             SWAP_INT32(&ugptr->prob1.l);
00228             SWAP_INT32(&ugptr->bo_wt1.l);
00229             SWAP_INT32(&ugptr->bigrams);
00230         }
00231         /* Convert values to log. */
00232         ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
00233         ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
00234         ++ugptr;
00235     }
00236     E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
00237 
00238     /* Now mmap() the file and read in the rest of the (read-only) stuff. */
00239     if (do_mmap) {
00240         offset = ftell(fp);
00241         fseek(fp, 0, SEEK_END);
00242         filesize = ftell(fp);
00243         fseek(fp, offset, SEEK_SET);
00244 
00245         /* Check for improper word alignment. */
00246         if (offset & 0x3) {
00247             E_WARN("-mmap specified, but tseg_base is not word-aligned.  Will not memory-map.\n");
00248             do_mmap = FALSE;
00249         }
00250         else {
00251             model->dump_mmap = mmio_file_read(file_name);
00252             if (model->dump_mmap == NULL) {
00253                 do_mmap = FALSE;
00254             }
00255             else {
00256                 map_base = mmio_file_ptr(model->dump_mmap);
00257             }
00258         }
00259     }
00260 
00261     /* read bigrams */
00262     if (do_mmap) {
00263         model->lm3g.bigrams = (bigram_t *) (map_base + offset);
00264         offset += (n_bigram + 1) * sizeof(bigram_t);
00265     }
00266     else {
00267         model->lm3g.bigrams =
00268             ckd_calloc(n_bigram + 1, sizeof(bigram_t));
00269         if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
00270             != (size_t) n_bigram + 1) {
00271             E_ERROR("fread(bigrams) failed\n");
00272             ngram_model_free(base);
00273             fclose_comp(fp, is_pipe);
00274             return NULL;
00275         }
00276         if (do_swap) {
00277             for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
00278                  i++, bgptr++) {
00279                 SWAP_INT16(&bgptr->wid);
00280                 SWAP_INT16(&bgptr->prob2);
00281                 SWAP_INT16(&bgptr->bo_wt2);
00282                 SWAP_INT16(&bgptr->trigrams);
00283             }
00284         }
00285     }
00286     E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
00287 
00288     /* read trigrams */
00289     if (n_trigram > 0) {
00290         if (do_mmap) {
00291             model->lm3g.trigrams = (trigram_t *) (map_base + offset);
00292             offset += n_trigram * sizeof(trigram_t);
00293         }
00294         else {
00295             model->lm3g.trigrams =
00296                 ckd_calloc(n_trigram, sizeof(trigram_t));
00297             if (fread
00298                 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
00299                 != (size_t) n_trigram) {
00300                 E_ERROR("fread(trigrams) failed\n");
00301                 ngram_model_free(base);
00302                 fclose_comp(fp, is_pipe);
00303                 return NULL;
00304             }
00305             if (do_swap) {
00306                 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
00307                      i++, tgptr++) {
00308                     SWAP_INT16(&tgptr->wid);
00309                     SWAP_INT16(&tgptr->prob3);
00310                 }
00311             }
00312         }
00313         E_INFO("%8d = LM.trigrams read\n", n_trigram);
00314         /* Initialize tginfo */
00315         model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
00316         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00317     }
00318 
00319     /* read n_prob2 and prob2 array (in memory) */
00320     if (do_mmap)
00321         fseek(fp, offset, SEEK_SET);
00322     fread(&k, sizeof(k), 1, fp);
00323     if (do_swap) SWAP_INT32(&k);
00324     model->lm3g.n_prob2 = k;
00325     model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
00326     if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
00327         E_ERROR("fread(prob2) failed\n");
00328         ngram_model_free(base);
00329         fclose_comp(fp, is_pipe);
00330         return NULL;
00331     }
00332     for (i = 0; i < k; i++) {
00333         if (do_swap)
00334             SWAP_INT32(&model->lm3g.prob2[i].l);
00335         /* Convert values to log. */
00336         model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
00337     }
00338     E_INFO("%8d = LM.prob2 entries read\n", k);
00339 
00340     /* read n_bo_wt2 and bo_wt2 array (in memory) */
00341     if (base->n > 2) {
00342         fread(&k, sizeof(k), 1, fp);
00343         if (do_swap) SWAP_INT32(&k);
00344         model->lm3g.n_bo_wt2 = k;
00345         model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
00346         if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
00347             E_ERROR("fread(bo_wt2) failed\n");
00348             ngram_model_free(base);
00349             fclose_comp(fp, is_pipe);
00350             return NULL;
00351         }
00352         for (i = 0; i < k; i++) {
00353             if (do_swap)
00354                 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
00355             /* Convert values to log. */
00356             model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
00357         }
00358         E_INFO("%8d = LM.bo_wt2 entries read\n", k);
00359     }
00360 
00361     /* read n_prob3 and prob3 array (in memory) */
00362     if (base->n > 2) {
00363         fread(&k, sizeof(k), 1, fp);
00364         if (do_swap) SWAP_INT32(&k);
00365         model->lm3g.n_prob3 = k;
00366         model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
00367         if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
00368             E_ERROR("fread(prob3) failed\n");
00369             ngram_model_free(base);
00370             fclose_comp(fp, is_pipe);
00371             return NULL;
00372         }
00373         for (i = 0; i < k; i++) {
00374             if (do_swap)
00375                 SWAP_INT32(&model->lm3g.prob3[i].l);
00376             /* Convert values to log. */
00377             model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
00378         }
00379         E_INFO("%8d = LM.prob3 entries read\n", k);
00380     }
00381 
00382     /* read tseg_base size and tseg_base */
00383     if (do_mmap)
00384         offset = ftell(fp);
00385     if (n_trigram > 0) {
00386         if (do_mmap) {
00387             memcpy(&k, map_base + offset, sizeof(k));
00388             offset += sizeof(int32);
00389             model->lm3g.tseg_base = (int32 *) (map_base + offset);
00390             offset += k * sizeof(int32);
00391         }
00392         else {
00393             k = (n_bigram + 1) / BG_SEG_SZ + 1;
00394             fread(&k, sizeof(k), 1, fp);
00395             if (do_swap) SWAP_INT32(&k);
00396             model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
00397             if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
00398                 (size_t) k) {
00399                 E_ERROR("fread(tseg_base) failed\n");
00400                 ngram_model_free(base);
00401                 fclose_comp(fp, is_pipe);
00402                 return NULL;
00403             }
00404             if (do_swap)
00405                 for (i = 0; i < k; i++)
00406                     SWAP_INT32(&model->lm3g.tseg_base[i]);
00407         }
00408         E_INFO("%8d = LM.tseg_base entries read\n", k);
00409     }
00410 
00411     /* read ascii word strings */
00412     if (do_mmap) {
00413         memcpy(&k, map_base + offset, sizeof(k));
00414         offset += sizeof(int32);
00415         tmp_word_str = (char *) (map_base + offset);
00416         offset += k;
00417     }
00418     else {
00419         base->writable = TRUE;
00420         fread(&k, sizeof(k), 1, fp);
00421         if (do_swap) SWAP_INT32(&k);
00422         tmp_word_str = ckd_calloc(k, sizeof(char));
00423         if (fread(tmp_word_str, sizeof(char), k, fp) != (size_t) k) {
00424             E_ERROR("fread(word-string) failed\n");
00425             ngram_model_free(base);
00426             fclose_comp(fp, is_pipe);
00427             return NULL;
00428         }
00429     }
00430 
00431     /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
00432     for (i = 0, j = 0; i < k; i++)
00433         if (tmp_word_str[i] == '\0')
00434             j++;
00435     if (j != n_unigram) {
00436         E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
00437                 j, n_unigram);
00438         ngram_model_free(base);
00439         fclose_comp(fp, is_pipe);
00440         return NULL;
00441     }
00442 
00443     /* Break up string just read into words */
00444     if (do_mmap) {
00445         j = 0;
00446         for (i = 0; i < n_unigram; i++) {
00447             base->word_str[i] = tmp_word_str + j;
00448             if (hash_table_enter(base->wid, base->word_str[i],
00449                                  (void *)(long)i) != (void *)(long)i) {
00450                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00451             }
00452             j += strlen(base->word_str[i]) + 1;
00453         }
00454     }
00455     else {
00456         j = 0;
00457         for (i = 0; i < n_unigram; i++) {
00458             base->word_str[i] = ckd_salloc(tmp_word_str + j);
00459             if (hash_table_enter(base->wid, base->word_str[i],
00460                                  (void *)(long)i) != (void *)(long)i) {
00461                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00462             }
00463             j += strlen(base->word_str[i]) + 1;
00464         }
00465         free(tmp_word_str);
00466     }
00467     E_INFO("%8d = ascii word strings read\n", i);
00468 
00469     fclose_comp(fp, is_pipe);
00470     return base;
00471 }
00472 
00473 int
00474 ngram_model_dmp_write(ngram_model_t *model,
00475                       const char *file_name)
00476 {
00477     return -1;
00478 }
00479 
00480 static int
00481 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
00482                               float32 wip, float32 uw)
00483 {
00484     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00485     lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
00486     return 0;
00487 }
00488 
00489 /* Locate a specific bigram within a bigram list */
00490 #define BINARY_SEARCH_THRESH    16
00491 static int32
00492 find_bg(bigram_t * bg, int32 n, int32 w)
00493 {
00494     int32 i, b, e;
00495 
00496     /* Binary search until segment size < threshold */
00497     b = 0;
00498     e = n;
00499     while (e - b > BINARY_SEARCH_THRESH) {
00500         i = (b + e) >> 1;
00501         if (bg[i].wid < w)
00502             b = i + 1;
00503         else if (bg[i].wid > w)
00504             e = i;
00505         else
00506             return i;
00507     }
00508 
00509     /* Linear search within narrowed segment */
00510     for (i = b; (i < e) && (bg[i].wid != w); i++);
00511     return ((i < e) ? i : -1);
00512 }
00513 
00514 static int32
00515 lm3g_bg_score(ngram_model_dmp_t *model,
00516               int32 lw1, int32 lw2, int32 *n_used)
00517 {
00518     int32 i, n, b, score;
00519     bigram_t *bg;
00520 
00521     if (lw1 < 0) {
00522         *n_used = 1;
00523         return model->lm3g.unigrams[lw2].prob1.l;
00524     }
00525 
00526     b = FIRST_BG(model, lw1);
00527     n = FIRST_BG(model, lw1 + 1) - b;
00528     bg = model->lm3g.bigrams + b;
00529 
00530     if ((i = find_bg(bg, n, lw2)) >= 0) {
00531         /* Access mode = bigram */
00532         *n_used = 2;
00533         score = model->lm3g.prob2[bg[i].prob2].l;
00534     }
00535     else {
00536         /* Access mode = unigram */
00537         *n_used = 1;
00538         score = model->lm3g.unigrams[lw1].bo_wt1.l + model->lm3g.unigrams[lw2].prob1.l;
00539     }
00540 
00541     return (score);
00542 }
00543 
00544 static void
00545 load_tginfo(ngram_model_dmp_t *model, int32 lw1, int32 lw2)
00546 {
00547     int32 i, n, b, t;
00548     bigram_t *bg;
00549     tginfo_t *tginfo;
00550 
00551     /* First allocate space for tg information for bg lw1,lw2 */
00552     tginfo = (tginfo_t *) listelem_malloc(model->lm3g.le);
00553     tginfo->w1 = lw1;
00554     tginfo->tg = NULL;
00555     tginfo->next = model->lm3g.tginfo[lw2];
00556     model->lm3g.tginfo[lw2] = tginfo;
00557 
00558     /* Locate bigram lw1,lw2 */
00559     b = model->lm3g.unigrams[lw1].bigrams;
00560     n = model->lm3g.unigrams[lw1 + 1].bigrams - b;
00561     bg = model->lm3g.bigrams + b;
00562 
00563     if ((n > 0) && ((i = find_bg(bg, n, lw2)) >= 0)) {
00564         tginfo->bowt = model->lm3g.bo_wt2[bg[i].bo_wt2].l;
00565 
00566         /* Find t = Absolute first trigram index for bigram lw1,lw2 */
00567         b += i;                 /* b = Absolute index of bigram lw1,lw2 on disk */
00568         t = FIRST_TG(model, b);
00569 
00570         tginfo->tg = model->lm3g.trigrams + t;
00571 
00572         /* Find #tg for bigram w1,w2 */
00573         tginfo->n_tg = FIRST_TG(model, b + 1) - t;
00574     }
00575     else {                      /* No bigram w1,w2 */
00576         tginfo->bowt = 0;
00577         tginfo->n_tg = 0;
00578     }
00579 }
00580 
00581 /* Similar to find_bg */
00582 static int32
00583 find_tg(trigram_t * tg, int32 n, int32 w)
00584 {
00585     int32 i, b, e;
00586 
00587     b = 0;
00588     e = n;
00589     while (e - b > BINARY_SEARCH_THRESH) {
00590         i = (b + e) >> 1;
00591         if (tg[i].wid < w)
00592             b = i + 1;
00593         else if (tg[i].wid > w)
00594             e = i;
00595         else
00596             return i;
00597     }
00598 
00599     for (i = b; (i < e) && (tg[i].wid != w); i++);
00600     return ((i < e) ? i : -1);
00601 }
00602 
00603 static int32
00604 lm3g_tg_score(ngram_model_dmp_t *model, int32 lw1,
00605               int32 lw2, int32 lw3, int32 *n_used)
00606 {
00607     ngram_model_t *base = &model->base;
00608     int32 i, n, score;
00609     trigram_t *tg;
00610     tginfo_t *tginfo, *prev_tginfo;
00611 
00612     if ((base->n < 3) || (lw1 < 0))
00613         return (lm3g_bg_score(model, lw2, lw3, n_used));
00614 
00615     prev_tginfo = NULL;
00616     for (tginfo = model->lm3g.tginfo[lw2]; tginfo; tginfo = tginfo->next) {
00617         if (tginfo->w1 == lw1)
00618             break;
00619         prev_tginfo = tginfo;
00620     }
00621 
00622     if (!tginfo) {
00623         load_tginfo(model, lw1, lw2);
00624         tginfo = model->lm3g.tginfo[lw2];
00625     }
00626     else if (prev_tginfo) {
00627         prev_tginfo->next = tginfo->next;
00628         tginfo->next = model->lm3g.tginfo[lw2];
00629         model->lm3g.tginfo[lw2] = tginfo;
00630     }
00631 
00632     tginfo->used = 1;
00633 
00634     /* Trigrams for w1,w2 now pointed to by tginfo */
00635     n = tginfo->n_tg;
00636     tg = tginfo->tg;
00637     if ((i = find_tg(tg, n, lw3)) >= 0) {
00638         /* Access mode = trigram */
00639         *n_used = 3;
00640         score = model->lm3g.prob3[tg[i].prob3].l;
00641     }
00642     else {
00643         score = tginfo->bowt + lm3g_bg_score(model, lw2, lw3, n_used);
00644     }
00645 
00646     return (score);
00647 }
00648 
00649 static int32
00650 ngram_model_dmp_score(ngram_model_t *base, int32 wid,
00651                       int32 *history, int32 n_hist,
00652                       int32 *n_used)
00653 {
00654     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00655     switch (n_hist) {
00656     case 0:
00657         /* Access mode: unigram */
00658         *n_used = 1;
00659         return model->lm3g.unigrams[wid].prob1.l;
00660     case 1:
00661         return lm3g_bg_score(model, history[0], wid, n_used);
00662     case 2:
00663     default:
00664         /* Anything greater than 2 is the same as a trigram for now. */
00665         return lm3g_tg_score(model, history[1], history[0], wid, n_used);
00666     }
00667 }
00668 
00669 static int32
00670 ngram_model_dmp_raw_score(ngram_model_t *base, int32 wid,
00671                           int32 *history, int32 n_hist,
00672                           int32 *n_used)
00673 {
00674     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00675     int32 score;
00676 
00677     switch (n_hist) {
00678     case 0:
00679         /* Access mode: unigram */
00680         *n_used = 1;
00681         /* Undo insertion penalty. */
00682         score = model->lm3g.unigrams[wid].prob1.l - base->log_wip;
00683         /* Undo language weight. */
00684         score = (int32)(score / base->lw);
00685         /* Undo unigram interpolation */
00686         if (strcmp(base->word_str[wid], "<s>") != 0) { /* FIXME: configurable start_sym */
00687             score = logmath_log(base->lmath,
00688                                 logmath_exp(base->lmath, score)
00689                                 - logmath_exp(base->lmath, 
00690                                               base->log_uniform + base->log_uniform_weight));
00691         }
00692         return score;
00693     case 1:
00694         score = lm3g_bg_score(model, history[0], wid, n_used);
00695         break;
00696     case 2:
00697     default:
00698         /* Anything greater than 2 is the same as a trigram for now. */
00699         score = lm3g_tg_score(model, history[1], history[0], wid, n_used);
00700         break;
00701     }
00702     /* FIXME (maybe): This doesn't undo unigram weighting in backoff cases. */
00703     return (int32)((score - base->log_wip) / base->lw);
00704 }
00705 
00706 static int32
00707 ngram_model_dmp_add_ug(ngram_model_t *base,
00708                        int32 wid, int32 lweight)
00709 {
00710     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00711     return lm3g_add_ug(base, &model->lm3g, wid, lweight);
00712 }
00713 
00714 static void
00715 ngram_model_dmp_free(ngram_model_t *base)
00716 {
00717     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00718 
00719     ckd_free(model->lm3g.unigrams);
00720     ckd_free(model->lm3g.prob2);
00721     if (model->dump_mmap) {
00722         mmio_file_unmap(model->dump_mmap);
00723     } 
00724     else {
00725         ckd_free(model->lm3g.bigrams);
00726         if (base->n > 2) {
00727             ckd_free(model->lm3g.trigrams);
00728             ckd_free(model->lm3g.tseg_base);
00729         }
00730     }
00731     if (base->n > 2) {
00732         ckd_free(model->lm3g.bo_wt2);
00733         ckd_free(model->lm3g.prob3);
00734     }
00735 
00736     lm3g_tginfo_free(base, &model->lm3g);
00737 }
00738 
00739 static void
00740 ngram_model_dmp_flush(ngram_model_t *base)
00741 {
00742     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00743     lm3g_tginfo_reset(base, &model->lm3g);
00744 }
00745 
/*
 * Function table for DMP models; ngram_model_dmp_read() passes its address
 * to ngram_model_init().  The initializer is positional, so the entries
 * must stay in the member order of ngram_funcs_t.
 */
static ngram_funcs_t ngram_model_dmp_funcs = {
    ngram_model_dmp_free,          /* free */
    ngram_model_dmp_apply_weights, /* apply_weights */
    ngram_model_dmp_score,         /* score */
    ngram_model_dmp_raw_score,     /* raw_score */
    ngram_model_dmp_add_ug,        /* add_ug */
    ngram_model_dmp_flush          /* flush */
};

Generated on Fri Jan 14 2011 for SphinxBase by  doxygen 1.7.1