45 #include <sphinxbase/priority_queue.h> 46 #include <sphinxbase/byteorder.h> 48 #include "ngram_model_internal.h" 49 #include "ngrams_raw.h" 52 ngram_comparator(
const void *first_void,
const void *second_void)
54 static int order = -1;
55 uint32 *first, *second, *end;
57 if (first_void == NULL) {
59 order = *(
int *) second_void;
63 E_ERROR(
"Order for ngram comprator was not set\n");
69 for (; first != end; ++first, ++second) {
79 ngram_ord_comparator(
void *a_raw,
void *b_raw)
85 while (a_w_ptr < a->order && b_w_ptr < b->order) {
86 if (a->instance.words[a_w_ptr] == b->instance.words[b_w_ptr]) {
91 if (a->instance.words[a_w_ptr] < b->instance.words[b_w_ptr])
96 return b->order - a->order;
101 logmath_t * lmath,
int order,
int order_max,
107 char *wptr[NGRAM_MAX_ORDER + 1];
112 E_ERROR(
"Unexpected end of ARPA file. Failed to read %d-gram\n",
117 words_expected = order + 1;
121 NGRAM_MAX_ORDER + 1)) < words_expected) {
122 if ((*li)->buf[0] !=
'\0') {
123 E_WARN(
"Format error; %d-gram ignored: %s\n", order,
128 if (order == order_max) {
130 (
float *)
ckd_calloc(1,
sizeof(*raw_ngram->weights));
131 raw_ngram->weights[0] =
atof_c(wptr[0]);
132 if (raw_ngram->weights[0] > 0) {
133 E_WARN(
"%d-gram [%s] has positive probability. Zeroize\n",
135 raw_ngram->weights[0] = 0.0f;
137 raw_ngram->weights[0] =
141 float weight, backoff;
143 (
float *)
ckd_calloc(2,
sizeof(*raw_ngram->weights));
147 E_WARN(
"%d-gram [%s] has positive probability. Zeroize\n",
149 raw_ngram->weights[0] = 0.0f;
152 raw_ngram->weights[0] =
156 if (n == order + 1) {
157 raw_ngram->weights[1] = 0.0f;
160 backoff =
atof_c(wptr[order + 1]);
161 raw_ngram->weights[1] =
166 (uint32 *)
ckd_calloc(order,
sizeof(*raw_ngram->words));
167 for (word_out = raw_ngram->words + order - 1, i = 1;
168 word_out >= raw_ngram->words; --word_out, i++) {
177 int order,
int order_max)
179 char expected_header[20];
182 sprintf(expected_header,
"\\%d-grams:", order);
185 if (strcmp((*li)->buf, expected_header) == 0)
189 for (i = 0; i < count; i++) {
190 read_ngram_instance(li, wid, lmath, order, order_max,
191 &((*raw_ngrams)[i]));
195 ngram_comparator(NULL, &order);
196 qsort(*raw_ngrams, count,
sizeof(
ngram_raw_t), &ngram_comparator);
208 for (order_it = 2; order_it <= order; order_it++) {
209 ngrams_raw_read_order(&raw_ngrams[order_it - 2], li, wid, lmath,
210 counts[order_it - 1], order_it, order);
216 while (*li && strlen((*li)->buf) == 0) {
222 E_ERROR(
"ARPA file ends without end-mark\n");
224 if (strcmp((*li)->buf,
"\\end\\") != 0)
226 (
"Finished reading ARPA file. Expecting end mark but found [%s]\n",
233 read_dmp_weight_array(FILE * fp,
logmath_t * lmath, uint8 do_swap,
240 fread(&k,
sizeof(k), 1, fp);
245 fread(tmp_weight_arr,
sizeof(*tmp_weight_arr), k, fp);
246 for (i = 0; i < k; i++) {
248 SWAP_INT32(&tmp_weight_arr[i].l);
250 tmp_weight_arr[i].f =
254 for (i = 0; i < counts; i++) {
255 raw_ngrams[i].weights[weight_idx] =
256 tmp_weight_arr[(int) raw_ngrams[i].weights[weight_idx]].f;
261 #define BIGRAM_SEGMENT_SIZE 9 264 ngrams_raw_read_dmp(FILE * fp,
logmath_t * lmath, uint32 * counts,
265 int order, uint32 * unigram_next, uint8 do_swap)
269 uint16 *bigrams_next;
276 sizeof(*raw_ngrams[0]));
278 (uint16 *)
ckd_calloc((
size_t) (counts[1] + 1),
279 sizeof(*bigrams_next));
281 for (j = 0; j <= (int32) counts[1]; j++) {
282 uint16 wid, prob_idx, bo_idx;
285 fread(&wid,
sizeof(wid), 1, fp);
289 (uint32 *)
ckd_calloc(2,
sizeof(*raw_ngram->words));
290 raw_ngram->words[0] = (uint32) wid;
291 while (ngram_idx < counts[0] && j == unigram_next[ngram_idx]) {
294 raw_ngram->words[1] = (uint32) ngram_idx - 1;
296 (
float *)
ckd_calloc(2,
sizeof(*raw_ngram->weights));
297 fread(&prob_idx,
sizeof(prob_idx), 1, fp);
299 SWAP_INT16(&prob_idx);
300 raw_ngram->weights[0] = prob_idx + 0.5f;
301 fread(&bo_idx,
sizeof(bo_idx), 1, fp);
304 raw_ngram->weights[1] = bo_idx + 0.5f;
305 fread(&bigrams_next[j],
sizeof(bigrams_next[j]), 1, fp);
307 SWAP_INT16(&bigrams_next[j]);
309 assert(ngram_idx == counts[0]);
315 sizeof(*raw_ngrams[1]));
316 for (j = 0; j < (int32) counts[2]; j++) {
317 uint16 wid, prob_idx;
320 fread(&wid,
sizeof(wid), 1, fp);
324 (uint32 *)
ckd_calloc(3,
sizeof(*raw_ngram->words));
325 raw_ngram->words[0] = (uint32) wid;
327 (
float *)
ckd_calloc(1,
sizeof(*raw_ngram->weights));
328 fread(&prob_idx,
sizeof(prob_idx), 1, fp);
330 SWAP_INT16(&prob_idx);
331 raw_ngram->weights[0] = prob_idx + 0.5f;
336 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
342 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
345 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[2],
348 fread(&k,
sizeof(k), 1, fp);
351 tseg_base = (int32 *)
ckd_calloc(k,
sizeof(int32));
352 fread(tseg_base,
sizeof(int32), k, fp);
354 for (j = 0; j < (uint32) k; j++) {
355 SWAP_INT32(&tseg_base[j]);
359 for (j = 1; j <= counts[1]; j++) {
360 uint32 next_ngram_idx =
361 (uint32) (tseg_base[j >> BIGRAM_SEGMENT_SIZE] +
363 while (ngram_idx < next_ngram_idx) {
364 raw_ngrams[1][ngram_idx].words[1] =
365 raw_ngrams[0][j - 1].words[0];
366 raw_ngrams[1][ngram_idx].words[2] =
367 raw_ngrams[0][j - 1].words[1];
372 assert(ngram_idx == counts[2]);
378 ngram_comparator(NULL, &i);
379 qsort(raw_ngrams[0], (
size_t) counts[1],
sizeof(*raw_ngrams[0]),
383 ngram_comparator(NULL, &i);
384 qsort(raw_ngrams[1], (
size_t) counts[2],
sizeof(*raw_ngrams[1]),
391 ngrams_raw_fix_counts(
ngram_raw_t ** raw_ngrams, uint32 * counts,
392 uint32 * fixed_counts,
int order)
395 priority_queue_create(order - 1, &ngram_ord_comparator);
396 uint32 raw_ngram_ptrs[NGRAM_MAX_ORDER - 1];
397 uint32 words[NGRAM_MAX_ORDER];
400 memset(words, -1,
sizeof(words));
401 memcpy(fixed_counts, counts, order *
sizeof(*fixed_counts));
402 for (i = 2; i <= order; i++) {
405 if (counts[i - 1] <= 0)
409 tmp_ngram->order = i;
410 raw_ngram_ptrs[i - 2] = 0;
411 tmp_ngram->instance = raw_ngrams[i - 2][0];
412 priority_queue_add(ngrams, tmp_ngram);
416 int32 to_increment = TRUE;
418 if (priority_queue_size(ngrams) == 0) {
422 if (top->order == 2) {
423 memcpy(words, top->instance.words, 2 *
sizeof(*words));
426 for (i = 0; i < top->order - 1; i++) {
427 if (words[i] != top->instance.words[i]) {
429 num = (i == 0) ? 1 : i;
430 memcpy(words, top->instance.words,
431 (num + 1) *
sizeof(*words));
433 to_increment = FALSE;
437 words[top->order - 1] = top->instance.words[top->order - 1];
440 raw_ngram_ptrs[top->order - 2]++;
442 if (raw_ngram_ptrs[top->order - 2] < counts[top->order - 1]) {
444 raw_ngrams[top->order - 2][raw_ngram_ptrs[top->order - 2]];
445 priority_queue_add(ngrams, top);
452 assert(priority_queue_size(ngrams) == 0);
453 priority_queue_free(ngrams, NULL);
457 ngrams_raw_free(
ngram_raw_t ** raw_ngrams, uint32 * counts,
int order)
462 for (order_it = 0; order_it < order - 1; order_it++) {
463 for (num = 0; num < counts[order_it + 1]; num++) {
464 ckd_free(raw_ngrams[order_it][num].weights);
465 ckd_free(raw_ngrams[order_it][num].words);
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
Miscellaneous useful string functions.
#define ckd_calloc(n, sz)
Macros to simplify the use of the above functions.
#define E_ERROR(...)
Print error message to error log.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Implementation of logging routines.
#define E_WARN(...)
Print warning message to error log.
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
File I/O related operations.