00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
#include "config.h"
#include "ngram_model.h"
#include "ngram_model_internal.h"
#include "ckd_alloc.h"
#include "filename.h"
#include "pio.h"
#include "err.h"
#include "logmath.h"
#include "strfuncs.h"
#include "case.h"

#include <assert.h>
#include <errno.h>
#include <stdarg.h>
#include <string.h>
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
00059
00060 ngram_file_type_t
00061 ngram_file_name_to_type(const char *file_name)
00062 {
00063 const char *ext;
00064
00065 ext = strrchr(file_name, '.');
00066 if (ext == NULL) {
00067 return NGRAM_ARPA;
00068 }
00069 if (0 == strcmp_nocase(ext, ".gz")) {
00070 while (--ext >= file_name) {
00071 if (*ext == '.') break;
00072 }
00073 if (ext < file_name) {
00074 return NGRAM_ARPA;
00075 }
00076 }
00077
00078 if (0 == strncmp_nocase(ext, ".ARPA", 5))
00079 return NGRAM_ARPA;
00080 if (0 == strncmp_nocase(ext, ".DMP32", 6))
00081 return NGRAM_DMP32;
00082 if (0 == strncmp_nocase(ext, ".DMP", 4))
00083 return NGRAM_DMP;
00084 return NGRAM_ARPA;
00085 }
00086
00087 ngram_model_t *
00088 ngram_model_read(cmd_ln_t *config,
00089 const char *file_name,
00090 ngram_file_type_t file_type,
00091 logmath_t *lmath)
00092 {
00093 ngram_model_t *model = NULL;
00094
00095 switch (file_type) {
00096 case NGRAM_AUTO: {
00097 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
00098 break;
00099 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
00100 break;
00101 if ((model = ngram_model_dmp32_read(config, file_name, lmath)) != NULL)
00102 break;
00103 return NULL;
00104 }
00105 case NGRAM_ARPA:
00106 model = ngram_model_arpa_read(config, file_name, lmath);
00107 break;
00108 case NGRAM_DMP:
00109 model = ngram_model_dmp_read(config, file_name, lmath);
00110 break;
00111 case NGRAM_DMP32:
00112 model = ngram_model_dmp32_read(config, file_name, lmath);
00113 break;
00114 }
00115
00116
00117 if (config) {
00118 float32 lw = 1.0;
00119 float32 wip = 1.0;
00120 float32 uw = 1.0;
00121
00122 if (cmd_ln_exists_r(config, "-lw"))
00123 lw = cmd_ln_float32_r(config, "-lw");
00124 if (cmd_ln_exists_r(config, "-wip"))
00125 wip = cmd_ln_float32_r(config, "-wip");
00126 if (cmd_ln_exists_r(config, "-uw"))
00127 uw = cmd_ln_float32_r(config, "-uw");
00128
00129 ngram_model_apply_weights(model, lw, wip, uw);
00130 }
00131
00132 return model;
00133 }
00134
00135 int
00136 ngram_model_write(ngram_model_t *model, const char *file_name,
00137 ngram_file_type_t file_type)
00138 {
00139 switch (file_type) {
00140 case NGRAM_AUTO: {
00141 file_type = ngram_file_name_to_type(file_name);
00142 return ngram_model_write(model, file_name, file_type);
00143 }
00144 case NGRAM_ARPA:
00145 return ngram_model_arpa_write(model, file_name);
00146 case NGRAM_DMP:
00147 return ngram_model_dmp_write(model, file_name);
00148 case NGRAM_DMP32:
00149 return ngram_model_dmp32_write(model, file_name);
00150 }
00151
00152 return -1;
00153 }
00154
00155 int32
00156 ngram_model_init(ngram_model_t *base,
00157 ngram_funcs_t *funcs,
00158 logmath_t *lmath,
00159 int32 n, int32 n_unigram)
00160 {
00161 base->refcount = 1;
00162 base->funcs = funcs;
00163 base->n = n;
00164
00165 if (base->n_counts == NULL)
00166 base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
00167
00168 if (base->lmath != lmath) {
00169
00170 base->lw = 1.0;
00171 base->log_wip = 0;
00172 base->log_uw = 0;
00173 base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
00174 base->log_uniform_weight = logmath_get_zero(lmath);
00175 base->log_zero = logmath_get_zero(lmath);
00176 base->lmath = lmath;
00177 }
00178
00179 if (base->word_str) {
00180
00181 if (base->writable) {
00182 int32 i;
00183 for (i = 0; i < base->n_words; ++i) {
00184 ckd_free(base->word_str[i]);
00185 base->word_str[i] = NULL;
00186 }
00187 }
00188 base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
00189 }
00190 else
00191 base->word_str = ckd_calloc(n_unigram, sizeof(char *));
00192
00193
00194 if (base->wid)
00195 hash_table_empty(base->wid);
00196 else
00197 base->wid = hash_table_new(n_unigram, FALSE);
00198 base->n_1g_alloc = base->n_words = n_unigram;
00199
00200 return 0;
00201 }
00202
00203 ngram_model_t *
00204 ngram_model_retain(ngram_model_t *model)
00205 {
00206 ++model->refcount;
00207 return model;
00208 }
00209
00210
00211 void
00212 ngram_model_flush(ngram_model_t *model)
00213 {
00214 if (model->funcs && model->funcs->flush)
00215 (*model->funcs->flush)(model);
00216 }
00217
/**
 * Release one reference to the model; tear it down completely when the
 * count reaches zero.  Returns the remaining reference count (0 when
 * the model has been freed).  NULL is tolerated.
 */
int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    /* Reference-counted: only tear down on the last release. */
    if (--model->refcount > 0)
        return model->refcount;
    /* Let the backend free its private data first. */
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Writable model: every word string is owned here. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Read-only model: only the class words were allocated by this
         * module; the remaining word_str entries are presumably owned
         * by the backend — NOTE(review): confirm against the readers. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            /* Words in the class's contiguous unigram range. */
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            /* Words added later, living in the overflow hash. */
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    /* Free the class objects themselves, then the remaining tables. */
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}
00262
00263
00264 #ifdef HAVE_ICONV
00265 int
00266 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00267 {
00268 iconv_t ic;
00269 char *outbuf;
00270 size_t maxlen;
00271 int i, writable;
00272 hash_table_t *new_wid;
00273
00274
00275
00276 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
00277 E_ERROR_SYSTEM("iconv_open() failed");
00278 return -1;
00279 }
00280
00281
00282
00283
00284
00285
00286
00287 maxlen = 0;
00288 for (i = 0; i < model->n_words; ++i) {
00289 if (strlen(model->word_str[i]) > maxlen)
00290 maxlen = strlen(model->word_str[i]);
00291 }
00292
00293 writable = model->writable;
00294
00295 model->writable = TRUE;
00296
00297 maxlen = maxlen * sizeof(int) + 15;
00298 outbuf = ckd_calloc(maxlen, 1);
00299
00300
00301 new_wid = hash_table_new(model->n_words, FALSE);
00302 for (i = 0; i < model->n_words; ++i) {
00303 ICONV_CONST char *in;
00304 char *out;
00305 size_t inleft, outleft, result;
00306
00307 start_conversion:
00308 in = (ICONV_CONST char *)model->word_str[i];
00309
00310 inleft = strlen(in);
00311 out = outbuf;
00312 outleft = maxlen;
00313
00314 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
00315 if (errno != E2BIG) {
00316
00317
00318 E_ERROR_SYSTEM("iconv() failed");
00319 ckd_free(outbuf);
00320 hash_table_free(new_wid);
00321 return -1;
00322 }
00323
00324 iconv(ic, NULL, NULL, NULL, NULL);
00325
00326 maxlen *= 2;
00327 out = outbuf = ckd_realloc(outbuf, maxlen);
00328
00329 in = (ICONV_CONST char *)model->word_str[i];
00330 inleft = strlen(in);
00331 }
00332
00333
00334 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
00335 if (errno != E2BIG) {
00336
00337
00338 E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
00339 ckd_free(outbuf);
00340 hash_table_free(new_wid);
00341 return -1;
00342 }
00343
00344 iconv(ic, NULL, NULL, NULL, NULL);
00345
00346 maxlen *= 2;
00347 outbuf = ckd_realloc(outbuf, maxlen);
00348
00349 goto start_conversion;
00350 }
00351
00352 result = maxlen - outleft;
00353
00354 if (writable) {
00355
00356 model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
00357 model->word_str[i][result] = '\0';
00358 }
00359 else {
00360
00361 model->word_str[i] = ckd_calloc(result + 1, 1);
00362 }
00363
00364 memcpy(model->word_str[i], outbuf, result);
00365
00366
00367
00368
00369 if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00370 E_WARN("Duplicate word in dictionary after conversion: %s\n",
00371 model->word_str[i]);
00372 }
00373 }
00374 ckd_free(outbuf);
00375 iconv_close(ic);
00376
00377 hash_table_free(model->wid);
00378 model->wid = new_wid;
00379
00380 return 0;
00381 }
00382 #else
00383 int
00384 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00385 {
00386 return -1;
00387 }
00388 #endif
00389
00390 int
00391 ngram_model_apply_weights(ngram_model_t *model,
00392 float32 lw, float32 wip, float32 uw)
00393 {
00394 return (*model->funcs->apply_weights)(model, lw, wip, uw);
00395 }
00396
00397 float32
00398 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
00399 int32 *out_log_uw)
00400 {
00401 if (out_log_wip) *out_log_wip = model->log_wip;
00402 if (out_log_uw) *out_log_uw = model->log_uw;
00403 return model->lw;
00404 }
00405
00406
/**
 * Weighted n-gram score of wid given n_hist history word IDs.  Class
 * word IDs are resolved to their class tag before scoring, and the
 * word's within-class log probability is added to the result.
 * NOTE: class IDs in history[] are rewritten IN PLACE to tag IDs.
 */
int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Word not in the vocabulary (and no <UNK>): impossible event. */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* A class word is scored as its tag word's n-gram score plus the
     * word's probability within the class. */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 0)
            /* ngram_class_prob() returns 0 when the word is not in the
             * class.  NOTE(review): 0 also equals log(1.0), which
             * collides with this sentinel for a probability-1 member —
             * confirm. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    /* Replace any class words in the history with their class tags. */
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    /* Multiply in the class weight (addition in the log domain). */
    return score + class_weight;
}
00438
00439 int32
00440 ngram_score(ngram_model_t *model, const char *word, ...)
00441 {
00442 va_list history;
00443 const char *hword;
00444 int32 *histid;
00445 int32 n_hist;
00446 int32 n_used;
00447 int32 prob;
00448
00449 va_start(history, word);
00450 n_hist = 0;
00451 while ((hword = va_arg(history, const char *)) != NULL)
00452 ++n_hist;
00453 va_end(history);
00454
00455 histid = ckd_calloc(n_hist, sizeof(*histid));
00456 va_start(history, word);
00457 n_hist = 0;
00458 while ((hword = va_arg(history, const char *)) != NULL) {
00459 histid[n_hist] = ngram_wid(model, hword);
00460 ++n_hist;
00461 }
00462 va_end(history);
00463
00464 prob = ngram_ng_score(model, ngram_wid(model, word),
00465 histid, n_hist, &n_used);
00466 ckd_free(histid);
00467 return prob;
00468 }
00469
00470 int32
00471 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
00472 {
00473 int32 hist[2] = { w2, w1 };
00474 return ngram_ng_score(model, w3, hist, 2, n_used);
00475 }
00476
00477 int32
00478 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
00479 {
00480 return ngram_ng_score(model, w2, &w1, 1, n_used);
00481 }
00482
00483 int32
00484 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
00485 int32 n_hist, int32 *n_used)
00486 {
00487 int32 prob, class_weight = 0;
00488 int i;
00489
00490
00491 if (wid == NGRAM_INVALID_WID)
00492 return model->log_zero;
00493
00494
00495 if (NGRAM_IS_CLASSWID(wid)) {
00496 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00497
00498 class_weight = ngram_class_prob(lmclass, wid);
00499 if (class_weight == model->log_zero)
00500 return class_weight;
00501 wid = lmclass->tag_wid;
00502 }
00503 for (i = 0; i < n_hist; ++i) {
00504 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00505 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00506 }
00507 prob = (*model->funcs->raw_score)(model, wid, history,
00508 n_hist, n_used);
00509
00510 return prob + class_weight;
00511 }
00512
00513 int32
00514 ngram_prob(ngram_model_t *model, const char *word, ...)
00515 {
00516 va_list history;
00517 const char *hword;
00518 int32 *histid;
00519 int32 n_hist;
00520 int32 n_used;
00521 int32 prob;
00522
00523 va_start(history, word);
00524 n_hist = 0;
00525 while ((hword = va_arg(history, const char *)) != NULL)
00526 ++n_hist;
00527 va_end(history);
00528
00529 histid = ckd_calloc(n_hist, sizeof(*histid));
00530 va_start(history, word);
00531 n_hist = 0;
00532 while ((hword = va_arg(history, const char *)) != NULL) {
00533 histid[n_hist] = ngram_wid(model, hword);
00534 ++n_hist;
00535 }
00536 va_end(history);
00537
00538 prob = ngram_ng_prob(model, ngram_wid(model, word),
00539 histid, n_hist, &n_used);
00540 ckd_free(histid);
00541 return prob;
00542 }
00543
00544 int32
00545 ngram_score_to_prob(ngram_model_t *base, int32 score)
00546 {
00547 int32 prob;
00548
00549
00550 prob = score - base->log_wip;
00551
00552 prob = (int32)(prob / base->lw);
00553
00554 return prob;
00555 }
00556
00557 int32
00558 ngram_unknown_wid(ngram_model_t *model)
00559 {
00560 int32 val;
00561
00562
00563
00564 if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
00565 return NGRAM_INVALID_WID;
00566 else
00567 return val;
00568 }
00569
00570 int32
00571 ngram_zero(ngram_model_t *model)
00572 {
00573 return model->log_zero;
00574 }
00575
00576 int32
00577 ngram_model_get_size(ngram_model_t *model)
00578 {
00579 if (model != NULL)
00580 return model->n;
00581 return 0;
00582 }
00583
00584 int32 const *
00585 ngram_model_get_counts(ngram_model_t *model)
00586 {
00587 if (model != NULL)
00588 return model->n_counts;
00589 return NULL;
00590 }
00591
00592 int32
00593 ngram_wid(ngram_model_t *model, const char *word)
00594 {
00595 int32 val;
00596
00597 if (hash_table_lookup_int32(model->wid, word, &val) == -1)
00598 return ngram_unknown_wid(model);
00599 else
00600 return val;
00601 }
00602
00603 const char *
00604 ngram_word(ngram_model_t *model, int32 wid)
00605 {
00606
00607 wid = NGRAM_BASEWID(wid);
00608 if (wid >= model->n_words)
00609 return NULL;
00610 return model->word_str[wid];
00611 }
00612
/**
 * Append a word to the model's unigram vocabulary and word-ID hash.
 * If classid >= 0 the returned word ID carries the class bits (via
 * NGRAM_CLASSWID); otherwise it is the plain unigram index.  Returns
 * NGRAM_INVALID_WID if the word is already defined.
 */
int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    void *dummy;
    int32 wid;

    /* The new word takes the next free unigram slot. */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Reject duplicates rather than silently shadowing an entry. */
    if (hash_table_lookup(model->wid, word, &dummy) == 0) {
        E_ERROR("Duplicate definition of word %s\n", word);
        return NGRAM_INVALID_WID;
    }

    /* Grow the string table in UG_ALLOC_STEP increments. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }

    /* The hash key aliases the word_str entry, so the model owns
     * exactly one copy of each string. */
    model->word_str[model->n_words] = ckd_salloc(word);

    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }

    ++model->n_words;
    return wid;
}
00652
/**
 * Add a unigram word with the given (linear) probability weight.
 * Returns the new word ID, NGRAM_INVALID_WID if the word already
 * exists, or -1 if the backend rejects it.
 */
int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Ask the backend to allocate a unigram entry, if it supports
     * adding words. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0) {
        /* Backend refused: roll back the string allocation.
         * NOTE(review): the word-ID hash still holds a (now dangling)
         * entry for this word and n_words is not decremented — confirm
         * whether a fuller rollback is needed. */
        if (model->writable)
            ckd_free(model->word_str[wid]);
        return -1;
    }
    return wid;
}
00673
/**
 * Build a class object for tag word tag_wid whose members occupy the
 * contiguous word-ID range starting at start_wid, with per-word
 * float32 weights supplied in classwords.  Weights are renormalized
 * (in place, mutating the caller's list) when they do not sum to
 * roughly 1.
 */
ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;

    /* Members are a contiguous unigram range; the overflow hash is
     * only created later by ngram_class_add_word(). */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    /* Tolerate small rounding error; otherwise renormalize. */
    if (tprob > 1.1 || tprob < 0.9) {
        E_WARN("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;   /* writes through the list nodes */
        }
    }
    /* Store each member's weight as a log probability. */
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}
00706
/**
 * Add a word to a class's overflow hash (used for words added after
 * class creation).  lweight is the word's within-class log
 * probability.  Returns the hash slot used.
 *
 * The table is open-addressed with explicit `next` chain indices;
 * n_hash stays a power of two so (wid & (n_hash - 1)) is the home
 * bucket.
 */
int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Lazily create the table; 0xff fill makes wid/next == -1. */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }

    /* Home bucket for this word ID. */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Home bucket free: claim it directly. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;

        /* Walk to the end of this bucket's collision chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);

        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* Table full: double it.  Existing entries keep their
             * slots (no rehash), so chain indices stay valid. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* First slot of the newly added half. */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Linear scan for any free slot. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* Guaranteed by the n_hash_inuse check above. */
            assert(next != lmclass->n_hash);
        }
        /* Append the new entry to the chain. */
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}
00762
00763 void
00764 ngram_class_free(ngram_class_t *lmclass)
00765 {
00766 ckd_free(lmclass->nword_hash);
00767 ckd_free(lmclass->prob1);
00768 ckd_free(lmclass);
00769 }
00770
/**
 * Add a word to an existing class, discounting the weights of the
 * class's current members so the distribution stays normalized.
 * Returns the hash slot from ngram_class_add_word(), or
 * NGRAM_INVALID_WID on error.
 */
int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Map the class name to its tag word, then find the matching
     * class object (linear scan; class counts are small). */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* The tag word exists but no class was created for it. */
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Register the word in the model's vocabulary with class bits. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* The new word's probability, spread over the enlarged class. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);

    /* Renormalize the existing members by scaling each by (1 - fprob)
     * (an addition in the log domain). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Finally store the new word in the class's overflow hash. */
    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}
00820
/**
 * Create a new class whose tag word is classname (added as a unigram
 * with weight classweight if not already present), containing n_words
 * member words with the given linear weights.  Returns the new class
 * ID, or -1 on failure.
 */
int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Add the tag word to the vocabulary if it is not yet known. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    /* Class IDs are capped at 128. */
    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        /* Members occupy a contiguous range starting at the first. */
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    /* glist_add_* prepends, so restore the caller's order. */
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    /* Grow the class array and install the new class. */
    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
00871
00872 int32
00873 ngram_class_prob(ngram_class_t *lmclass, int32 wid)
00874 {
00875 int32 base_wid = NGRAM_BASEWID(wid);
00876
00877 if (base_wid < lmclass->start_wid
00878 || base_wid > lmclass->start_wid + lmclass->n_words) {
00879 int32 hash;
00880
00881
00882 hash = wid & (lmclass->n_hash - 1);
00883 while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
00884 hash = lmclass->nword_hash[hash].next;
00885 if (hash == -1)
00886 return 0;
00887 return lmclass->nword_hash[hash].prob1;
00888 }
00889 else {
00890 return lmclass->prob1[base_wid - lmclass->start_wid];
00891 }
00892 }
00893
/**
 * Parse a class-definition file into the given hash table, keyed by
 * class name with classdef_t values.  The expected syntax is
 * "LMCLASS <name>" ... word [prob] ... "END <name>".  Returns 0 on
 * success, -1 on parse or I/O error.  Ownership of class names and
 * word strings transfers to the hash table entries.
 */
int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        /* Split the line into at most two whitespace-separated words. */
        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* "END <classname>" closes the current class. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                /* The END tag must name the class being closed. */
                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Convert the accumulated (reversed) lists to arrays. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Hand classname and the classdef to the hash table;
                 * a duplicate class name is an error. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* The word strings now live in classdef->words, so
                 * free only the list nodes here. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                /* Member line: "word [prob]"; default weight 1.0. */
                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f;

                /* Lists accumulate in reverse; fixed up at END. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* "LMCLASS <classname>" opens a new class; anything else
             * outside a class body is ignored. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }

        }
    }
    rv = 0;

error_out:
    /* On error, free any half-built class state; on the success path
     * classwords/classprobs/classname are already NULL. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
01002
01003 void
01004 classdef_free(classdef_t *classdef)
01005 {
01006 int32 i;
01007 for (i = 0; i < classdef->n_words; ++i)
01008 ckd_free(classdef->words[i]);
01009 ckd_free(classdef->words);
01010 ckd_free(classdef->weights);
01011 ckd_free(classdef);
01012 }
01013
01014
/**
 * Read a class-definition file and add every class it defines to the
 * model (each class tag gets weight 1.0).  Returns 0 on success, -1
 * on failure.
 */
int32
ngram_model_read_classdef(ngram_model_t *model,
                          const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Walk the parsed classes and install each into the model. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    /* The classdefs (and their key strings) are only needed while
     * adding; free them regardless of outcome. */
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}