SphinxBase  5prealpha
ngram_model_internal.h
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * \file ngram_model_internal.h Internal structures for N-Gram models
39  *
40  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
41  */
42 
43 #ifndef __NGRAM_MODEL_INTERNAL_H__
44 #define __NGRAM_MODEL_INTERNAL_H__
45 
46 #include "sphinxbase/ngram_model.h"
47 #include "sphinxbase/hash_table.h"
48 
55 struct ngram_model_s { /* Common implementation of ngram_model_t. */
56  int refcount; /**< Reference count. */
57  uint32 *n_counts; /**< Counts for 1, 2, 3, ... */
58  int32 n_1g_alloc; /**< Number of allocated word strings (for new word addition). */
59  int32 n_words; /**< Number of actual word strings (NOT the same as the number of unigrams, due to class words). */
62  uint8 n; /**< This is an n-gram model (1, 2, 3, ...). */
63  uint8 n_classes; /**< Number of classes (maximum 128). */
64  uint8 writable; /**< Are word strings writable? */
65  uint8 flags; /**< Any other flags we might care about (FIXME: Merge this and writable). */
67  logmath_t *lmath; /**< Log-math object. */
68  float32 lw; /**< Language model scaling factor. */
69  int32 log_wip; /**< Log of word insertion penalty. */
70  int32 log_zero; /**< Zero probability, cached here for quick lookup. */
71  char **word_str; /**< Unigram names. */
72  hash_table_t *wid; /**< Mapping of unigram names to word IDs. */
73  int32 *tmp_wids; /**< Temporary array of word IDs for ngram_model_get_ngram(). */
74  struct ngram_class_s **classes; /**< Word class definitions. */
75  struct ngram_funcs_s *funcs; /**< Implementation-specific methods. */
76 };
77 
81 struct ngram_class_s { /* Implementation of ngram_class_t. */
82  int32 tag_wid; /**< Base word ID for this class tag. */
83  int32 start_wid; /**< Starting base word ID for this class' words. */
84  int32 n_words; /**< Number of base words for this class. */
85  int32 *prob1; /**< Probability table for base words. */
89  struct ngram_hash_s { /* Custom hash table for additional words. */
90  int32 wid; /**< Word ID of this bucket. */
91  int32 prob1; /**< Probability for this word. */
92  int32 next; /**< Index of next bucket (or -1 for no collision). */
93  } *nword_hash;
94  int32 n_hash; /**< Number of buckets in nword_hash (power of 2). */
95  int32 n_hash_inuse; /**< Number of words in nword_hash. */
96 };
97 
98 #define NGRAM_MAX_ORDER 5 /* NOTE(review): presumably the largest N-Gram order supported; confirm against the users of this macro. */
99 
100 #define NGRAM_HASH_SIZE 128 /* NOTE(review): presumably the initial bucket count of ngram_class_s::nword_hash; confirm against the users of this macro. */
101 
102 #define NGRAM_BASEWID(wid) ((wid)&0xffffff) /* Keeps the low 24 bits of wid (NOTE(review): presumably the base word ID; confirm). */
103 #define NGRAM_CLASSID(wid) (((wid)>>24) & 0x7f) /* Extracts bits 24-30 of wid, a 7-bit value. */
104 #define NGRAM_CLASSWID(wid,classid) (((classid)<<24) | 0x80000000 | (wid)) /* ORs classid shifted left by 24 and bit 31 (0x80000000) into wid. */
105 #define NGRAM_IS_CLASSWID(wid) ((wid)&0x80000000) /* Nonzero when bit 31 of wid is set, zero otherwise. */
106 
107 #define UG_ALLOC_STEP 10 /* NOTE(review): presumably the growth increment used when allocating room for new unigrams (cf. ngram_model_s::n_1g_alloc); confirm against the users of this macro. */
108 
110 typedef struct ngram_funcs_s { /* Implementation-specific functions for operating on ngram_model_t objects. */
114  void (*free) (ngram_model_t * model); /**< Implementation-specific function for freeing an ngram_model_t. */
118  int (*apply_weights) (ngram_model_t * model, float32 lw, float32 wip); /**< Implementation-specific function for applying language model weights. */
122  int32(*score) (ngram_model_t * model,
123  int32 wid,
124  int32 * history, int32 n_hist, int32 * n_used); /**< Implementation-specific function for querying language model score. */
129  int32(*raw_score) (ngram_model_t * model,
130  int32 wid,
131  int32 * history, int32 n_hist, int32 * n_used); /**< Implementation-specific function for querying raw language model probability. */
143  int32(*add_ug) (ngram_model_t * model, int32 wid, int32 lweight); /**< Implementation-specific function for adding unigrams. */
144 
148  void (*flush) (ngram_model_t * model); /**< Implementation-specific function for purging N-Gram cache. */
149 } ngram_funcs_t;
150 
154 typedef struct classdef_s { /* One class definition from a classdef file. */
155  char **words; /* NOTE(review): presumably the words that belong to the class; confirm. */
156  float32 *weights; /* NOTE(review): presumably one weight per entry of words; confirm. */
157  int32 n_words; /* NOTE(review): presumably the number of entries in words and weights; confirm. */
158 } classdef_t;
159 
163 int32
164 ngram_model_init(ngram_model_t * model,
165  ngram_funcs_t * funcs,
166  logmath_t * lmath, int32 n, int32 n_unigram); /* Prototype only; defined elsewhere. NOTE(review): presumably initializes the common ngram_model_t part of a model with its method table, log-math object, order n and unigram count; confirm, including the return-value convention. */
167 
171 int32 read_classdef_file(hash_table_t * classes,
172  const char *classdef_file); /* Prototype only; defined elsewhere. NOTE(review): presumably parses classdef_file and records the class definitions it finds in the classes hash table; confirm, including the return-value convention. */
173 
177 void classdef_free(classdef_t * classdef); /* Prototype only; defined elsewhere. NOTE(review): presumably releases a classdef_t and the storage it owns; confirm. */
178 
182 ngram_class_t *ngram_class_new(ngram_model_t * model, int32 tag_wid,
183  int32 start_wid, glist_t classwords); /* Prototype only; defined elsewhere. NOTE(review): presumably builds a new class for model from the classwords list, with tag_wid and start_wid matching the ngram_class_s members of the same names; confirm, including ownership of the result. */
184 
188 void ngram_class_free(ngram_class_t * lmclass); /* Prototype only; defined elsewhere. NOTE(review): presumably releases a class object and the storage it owns; confirm. */
189 
195 int32 ngram_class_prob(ngram_class_t * lmclass, int32 wid); /* Prototype only; defined elsewhere. NOTE(review): presumably returns the probability stored in lmclass for word ID wid; the result for a word that is not in the class is not visible here; confirm. */
196 
197 #endif /* __NGRAM_MODEL_INTERNAL_H__ */
ngram_funcs_s::free
void(* free)(ngram_model_t *model)
Implementation-specific function for freeing an ngram_model_t.
Definition: ngram_model_internal.h:114
ngram_funcs_s::score
int32(* score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying language model score.
Definition: ngram_model_internal.h:122
ngram_model_s::log_zero
int32 log_zero
Zero probability, cached here for quick lookup.
Definition: ngram_model_internal.h:70
ngram_model_s::n_classes
uint8 n_classes
Number of classes (maximum 128)
Definition: ngram_model_internal.h:63
ngram_model_s::n_counts
uint32 * n_counts
Counts for 1, 2, 3, ... grams.
Definition: ngram_model_internal.h:57
ngram_model_s::n
uint8 n
This is an n-gram model (1, 2, 3, ...).
Definition: ngram_model_internal.h:62
ngram_model_s::lmath
logmath_t * lmath
Log-math object.
Definition: ngram_model_internal.h:67
classdef_s
One class definition from a classdef file.
Definition: ngram_model_internal.h:154
ngram_class_s::ngram_hash_s::prob1
int32 prob1
Probability for this word.
Definition: ngram_model_internal.h:91
ngram_class_s::n_hash
int32 n_hash
Number of buckets in nword_hash (power of 2)
Definition: ngram_model_internal.h:94
ngram_class_s::ngram_hash_s::wid
int32 wid
Word ID of this bucket.
Definition: ngram_model_internal.h:90
ngram_model_s::lw
float32 lw
Language model scaling factor.
Definition: ngram_model_internal.h:68
gnode_s
A node in a generic list.
Definition: glist.h:100
ngram_model_s::n_words
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words).
Definition: ngram_model_internal.h:59
ngram_model_s::classes
struct ngram_class_s ** classes
Word class definitions.
Definition: ngram_model_internal.h:74
ngram_funcs_s::raw_score
int32(* raw_score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying raw language model probability.
Definition: ngram_model_internal.h:129
ngram_model_s::funcs
struct ngram_funcs_s * funcs
Implementation-specific methods.
Definition: ngram_model_internal.h:75
ngram_class_s::n_words
int32 n_words
Number of base words for this class.
Definition: ngram_model_internal.h:84
ngram_class_s::tag_wid
int32 tag_wid
Base word ID for this class tag.
Definition: ngram_model_internal.h:82
ngram_model_s::log_wip
int32 log_wip
Log of word insertion penalty.
Definition: ngram_model_internal.h:69
ngram_class_s::ngram_hash_s::next
int32 next
Index of next bucket (or -1 for no collision)
Definition: ngram_model_internal.h:92
ngram_class_s::start_wid
int32 start_wid
Starting base word ID for this class' words.
Definition: ngram_model_internal.h:83
ngram_class_s::ngram_hash_s
Custom hash table for additional words.
Definition: ngram_model_internal.h:89
ngram_model_s::tmp_wids
int32 * tmp_wids
Temporary array of word IDs for ngram_model_get_ngram()
Definition: ngram_model_internal.h:73
ngram_class_s::n_hash_inuse
int32 n_hash_inuse
Number of words in nword_hash.
Definition: ngram_model_internal.h:95
hash_table_s
Definition: hash_table.h:159
ngram_model_s::wid
hash_table_t * wid
Mapping of unigram names to word IDs.
Definition: ngram_model_internal.h:72
ngram_funcs_s
Implementation-specific functions for operating on ngram_model_t objects.
Definition: ngram_model_internal.h:110
ngram_model_s::writable
uint8 writable
Are word strings writable?
Definition: ngram_model_internal.h:64
ngram_class_s
Implementation of ngram_class_t.
Definition: ngram_model_internal.h:81
ngram_model_s::word_str
char ** word_str
Unigram names.
Definition: ngram_model_internal.h:71
ngram_class_s::prob1
int32 * prob1
Probability table for base words.
Definition: ngram_model_internal.h:85
ngram_model.h
N-Gram language models.
ngram_model_s::refcount
int refcount
Reference count.
Definition: ngram_model_internal.h:56
ngram_model_s::flags
uint8 flags
Any other flags we might care about (FIXME: Merge this and writable)
Definition: ngram_model_internal.h:65
logmath_s
Definition: logmath.c:49
ngram_model_s::n_1g_alloc
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
Definition: ngram_model_internal.h:58
ngram_funcs_s::apply_weights
int(* apply_weights)(ngram_model_t *model, float32 lw, float32 wip)
Implementation-specific function for applying language model weights.
Definition: ngram_model_internal.h:118
ngram_funcs_s::add_ug
int32(* add_ug)(ngram_model_t *model, int32 wid, int32 lweight)
Implementation-specific function for adding unigrams.
Definition: ngram_model_internal.h:143
ngram_funcs_s::flush
void(* flush)(ngram_model_t *model)
Implementation-specific function for purging N-Gram cache.
Definition: ngram_model_internal.h:148
hash_table.h
Hash table implementation.
ngram_model_s
Common implementation of ngram_model_t.
Definition: ngram_model_internal.h:55