SphinxBase  5prealpha
sphinx_fe.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <time.h>
41 #include <assert.h>
42 
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46 
47 #include <sphinxbase/fe.h>
48 #include <sphinxbase/strfuncs.h>
49 #include <sphinxbase/pio.h>
50 #include <sphinxbase/filename.h>
51 #include <sphinxbase/cmd_ln.h>
52 #include <sphinxbase/err.h>
53 #include <sphinxbase/ckd_alloc.h>
54 #include <sphinxbase/byteorder.h>
55 #include <sphinxbase/hash_table.h>
56 
57 #include "sphinx_wave2feat.h"
58 #include "cmd_ln_defn.h"
59 
60 typedef struct audio_type_s {
61  char const *name;
62  int (*detect)(sphinx_wave2feat_t *wtf);
63  int (*decode)(sphinx_wave2feat_t *wtf);
64 } audio_type_t;
65 
66 typedef struct output_type_s {
67  char const *name;
68  int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
69  int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
71 
73  int refcount;
75  fe_t *fe;
76  char *infile;
77  char *outfile;
78  FILE *infh;
79  FILE *outfh;
80  short *audio;
81  mfcc_t **feat;
82  int blocksize;
83  int featsize;
84  int veclen;
85  int in_veclen;
86  int byteswap;
87  output_type_t const *ot;
88 };
89 
91 typedef struct RIFFHeader{
92  char rifftag[4]; /* "RIFF" string */
93  int32 TotalLength; /* Total length */
94  char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
95  int32 RemainingLength; /* Remaining length */
96  int16 data_format; /* data format tag, 1 = PCM */
97  int16 numchannels; /* Number of channels in file */
98  int32 SamplingFreq; /* Sampling frequency */
99  int32 BytesPerSec; /* Average bytes/sec */
100  int16 BlockAlign; /* Block align */
101  int16 BitsPerSample; /* 8 or 16 bit */
102  char datatag[4]; /* "data" string */
103  int32 datalength; /* Raw data length */
104 } MSWAV_hdr;
105 
111 static int
112 detect_riff(sphinx_wave2feat_t *wtf)
113 {
114  FILE *fh;
115  MSWAV_hdr hdr;
116 
117  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
118  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
119  return -1;
120  }
121  if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
122  E_ERROR("Failed to read RIFF header");
123  fclose(fh);
124  return -1;
125  }
126  /* Make sure it is actually a RIFF file. */
127  if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
128  fclose(fh);
129  return FALSE;
130  }
131 
132  /* Get relevant information. */
133  cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
134  cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
135  wtf->infh = fh;
136 
137  return TRUE;
138 }
139 
140 static int
141 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
142 {
143  char nist[7];
144  lineiter_t *li;
145  FILE *fh;
146 
147  if ((fh = fopen(infile, "rb")) == NULL) {
148  E_ERROR_SYSTEM("Failed to open %s", infile);
149  return -1;
150  }
151  if (fread(&nist, 1, 7, fh) != 7) {
152  E_ERROR_SYSTEM("Failed to read NIST header");
153  fclose(fh);
154  return -1;
155  }
156  /* Is this actually a NIST file? */
157  if (0 != strncmp(nist, "NIST_1A", 7)) {
158  fclose(fh);
159  return FALSE;
160  }
161  /* Rewind, parse lines. */
162  fseek(fh, 0, SEEK_SET);
163  for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
164  char **words;
165  int nword;
166 
167  string_trim(li->buf, STRING_BOTH);
168  if (strlen(li->buf) == 0) {
169  lineiter_free(li);
170  break;
171  }
172  nword = str2words(li->buf, NULL, 0);
173  if (nword != 3)
174  continue;
175  words = (char **)ckd_calloc(nword, sizeof(*words));
176  str2words(li->buf, words, nword);
177  if (0 == strcmp(words[0], "sample_rate")) {
178  cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
179  }
180  if (0 == strcmp(words[0], "channel_count")) {
181  cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
182  }
183  if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
184  cmd_ln_set_str_r(wtf->config, "-input_endian",
185  (0 == strcmp(words[2], "10")) ? "big" : "little");
186  }
187  ckd_free(words);
188  }
189 
190  fseek(fh, 1024, SEEK_SET);
191  if (out_fh)
192  *out_fh = fh;
193  else
194  fclose(fh);
195  return TRUE;
196 }
197 
198 #ifdef HAVE_POPEN
199 static int
200 detect_sph2pipe(sphinx_wave2feat_t *wtf)
201 {
202  FILE *fh;
203  char *cmdline;
204  int rv;
205 
206  /* Determine if it's NIST file and get parameters. */
207  if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
208  return rv;
209 
210  /* Now popen it with sph2pipe. */
211  cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
212  if ((fh = popen(cmdline, "r")) == NULL) {
213  E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
214  ckd_free(cmdline);
215  return -1;
216  }
217 
218  wtf->infh = fh;
219  return TRUE;
220 }
221 #else /* !HAVE_POPEN */
222 static int
223 detect_sph2pipe(sphinx_wave2feat_t *wtf)
224 {
225  E_ERROR("popen() not available, cannot run sph2pipe\n");
226  return -1;
227 }
228 #endif /* !HAVE_POPEN */
229 
235 static int
236 detect_nist(sphinx_wave2feat_t *wtf)
237 {
238  FILE *fh;
239  int rv;
240 
241  if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
242  return rv;
243  wtf->infh = fh;
244 
245  return TRUE;
246 }
247 
248 
255 static int
256 detect_raw(sphinx_wave2feat_t *wtf)
257 {
258  FILE *fh;
259 
260  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
261  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
262  return -1;
263  }
264  wtf->infh = fh;
265  return TRUE;
266 }
267 
274 static int
275 detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
276 {
277  FILE *fh;
278  int32 len;
279  long flen;
280 
281  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
282  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
283  return -1;
284  }
285  if (fread(&len, 4, 1, fh) != 1) {
286  E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
287  fclose(fh);
288  return -1;
289  }
290  fseek(fh, 0, SEEK_END);
291  flen = ftell(fh);
292 
293  /* figure out whether to byteswap */
294  flen = (flen / 4) - 1;
295  if (flen != len) {
296  /* First make sure this is an endianness problem, otherwise fail. */
297  SWAP_INT32(&len);
298  if (flen != len) {
299  SWAP_INT32(&len);
300  E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
301  len, flen);
302  return -1;
303  }
304  /* Set the input endianness to the opposite of the machine endianness... */
305  cmd_ln_set_str_r(wtf->config, "-input_endian",
306  (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
307  ? "little" : "big"));
308  }
309 
310  fseek(fh, 4, SEEK_SET);
311  wtf->infh = fh;
312  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
313  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
314  }
315  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
316  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
317  wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
318  }
319  else {
320  /* Should not happen. */
321  E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
322  assert(FALSE);
323  }
324 
325  return TRUE;
326 }
327 
328 int
329 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
330 {
331  int i, j;
332 
333  if (whichchan > 0) {
334  for (i = whichchan - 1; i < nsamp; i += nchans)
335  buf[i/nchans] = buf[i];
336  }
337  else {
338  for (i = 0; i < nsamp; i += nchans) {
339  float64 tmp = 0.0;
340  for (j = 0; j < nchans && i + j < nsamp; ++j) {
341  tmp += buf[i + j];
342  }
343  buf[i/nchans] = (int16)(tmp / nchans);
344  }
345  }
346  return i/nchans;
347 }
348 
353 static int
354 decode_pcm(sphinx_wave2feat_t *wtf)
355 {
356  size_t nsamp;
357  int32 n, nfr, nchans, whichchan;
358  uint32 nfloat;
359 
360  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
361  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
362  fe_start_stream(wtf->fe);
363  fe_start_utt(wtf->fe);
364  nfloat = 0;
365  while ((nsamp = fread(wtf->audio, sizeof(int16), wtf->blocksize, wtf->infh)) != 0) {
366  size_t nvec;
367  int16 const *inspeech;
368 
369  /* Byteswap stuff here if necessary. */
370  if (wtf->byteswap) {
371  for (n = 0; n < nsamp; ++n)
372  SWAP_INT16(wtf->audio + n);
373  }
374 
375  /* Mix or pick channels. */
376  if (nchans > 1)
377  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
378 
379  inspeech = wtf->audio;
380  nvec = wtf->featsize;
381  /* Consume all samples. */
382  while (nsamp) {
383  nfr = nvec;
384  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr, NULL);
385  if (nfr) {
386  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
387  return -1;
388  nfloat += n;
389  }
390  }
391  inspeech = wtf->audio;
392  }
393  /* Now process any leftover audio frames. */
394  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
395  if (nfr) {
396  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
397  return -1;
398  nfloat += n;
399  }
400 
401  if (fclose(wtf->infh) == EOF)
402  E_ERROR_SYSTEM("Failed to close input file");
403  wtf->infh = NULL;
404  return nfloat;
405 }
406 
411 static int
412 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
413 {
414  int nfloat = 0, n;
415  int featsize = wtf->featsize;
416 
417  /* If the input vector length is less than the output length, we
418  * need to do this one frame at a time, because there's empty
419  * space at the end of each vector in wtf->feat. */
420  if (wtf->in_veclen < wtf->veclen)
421  featsize = 1;
422  while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
423  featsize * wtf->in_veclen, wtf->infh)) != 0) {
424  int i, nfr = n / wtf->in_veclen;
425  if (n % wtf->in_veclen) {
426  E_ERROR("Size of file %d not a multiple of veclen %d\n",
427  n, wtf->in_veclen);
428  return -1;
429  }
430  /* Byteswap stuff here if necessary. */
431  if (wtf->byteswap) {
432  for (i = 0; i < n; ++i)
433  SWAP_FLOAT32(wtf->feat[0] + i);
434  }
435  fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
436  for (i = 0; i < nfr; ++i) {
437  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
438  if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
439  fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
440  else
441  fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
442  }
443  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
444  fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
445  }
446  }
447  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
448  return -1;
449  nfloat += n;
450  }
451 
452  if (fclose(wtf->infh) == EOF)
453  E_ERROR_SYSTEM("Failed to close input file");
454  wtf->infh = NULL;
455  return nfloat;
456 }
457 
458 static const audio_type_t types[] = {
459  { "-mswav", &detect_riff, &decode_pcm },
460  { "-nist", &detect_nist, &decode_pcm },
461  { "-raw", &detect_raw, &decode_pcm },
462  { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
463 };
464 static const int ntypes = sizeof(types)/sizeof(types[0]);
465 static const audio_type_t mfcc_type = {
466  "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
467 };
468 
474 static int
475 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
476 {
477  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
478  E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
479  return -1;
480  }
481  return 0;
482 }
483 
489 static int
490 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
491 {
492  int i, nfloat = 0;
493 
494  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
495  for (i = 0; i < nfr; ++i) {
496  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
497  E_ERROR_SYSTEM("Writing %d values to %s failed",
498  wtf->veclen, wtf->outfile);
499  return -1;
500  }
501  nfloat += wtf->veclen;
502  }
503  return nfloat;
504 }
505 
/**
 * HTK base parameter kinds (the low bits of the parameter-kind word).
 */
typedef enum htk_feature_kind_e {
    /* PCM audio (rarely used) */
    WAVEFORM = 0,
    /* LPC filter coefficients */
    LPC = 1,
    /* LPC reflection coefficients */
    LPCREFC = 2,
    /* LPC-based cepstral coefficients */
    LPCEPSTRA = 3,
    /* LPCC plus deltas */
    LPCDELCEP = 4,
    /* 16-bit integer LPC reflection coefficients */
    IREFC = 5,
    /* MFCCs */
    MFCC = 6,
    /* Log mel spectrum */
    FBANK = 7,
    /* Linear mel spectrum */
    MELSPEC = 8,
    /* User defined */
    USER = 9,
    /* Vector quantized data */
    DISCRETE = 10,
    /* PLP coefficients */
    PLP = 11
} htk_feature_kind_t;
520 
/**
 * HTK parameter-kind qualifier bits (octal), OR-ed with a base kind.
 */
typedef enum htk_feature_flag_e {
    /* has energy */
    _E = 0000100,
    /* absolute energy suppressed */
    _N = 0000200,
    /* has delta coefficients */
    _D = 0000400,
    /* has acceleration (delta-delta) coefficients */
    _A = 0001000,
    /* is compressed */
    _C = 0002000,
    /* has zero mean static coefficients (i.e. CMN) */
    _Z = 0004000,
    /* has CRC checksum */
    _K = 0010000,
    /* has 0th cepstral coefficient */
    _O = 0020000,
    /* has VQ data */
    _V = 0040000,
    /* has third differential coefficients */
    _T = 0100000
} htk_feature_flag_t;
533 
537 static int
538 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
539 {
540  int32 samp_period;
541  int16 samp_size;
542  int16 param_kind;
543  int swap = FALSE;
544 
545  /* HTK files are big-endian. */
546  if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
547  swap = TRUE;
548  /* Same file size thing as in Sphinx files (I think) */
549  if (swap) SWAP_INT32(&nfloat);
550  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
551  return -1;
552  /* Sample period in 100ns units. */
553  samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
554  if (swap) SWAP_INT32(&samp_period);
555  if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
556  return -1;
557  /* Sample size - veclen * sizeof each sample. */
558  samp_size = wtf->veclen * 4;
559  if (swap) SWAP_INT16(&samp_size);
560  if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
561  return -1;
562  /* Format and flags. */
563  if (cmd_ln_boolean_r(wtf->config, "-logspec")
564  || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
565  param_kind = FBANK; /* log mel-filter bank outputs */
566  else
567  param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
568  if (swap) SWAP_INT16(&param_kind);
569  if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
570  return -1;
571 
572  return 0;
573 }
574 
578 static int
579 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
580 {
581  int i, j, swap, htk_reorder, nfloat = 0;
582 
583  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
584  /* This is possibly inefficient, but probably not a big deal. */
585  swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
586  htk_reorder = (0 == strcmp("htk", wtf->ot->name)
587  && !(cmd_ln_boolean_r(wtf->config, "-logspec")
588  || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
589  for (i = 0; i < nfr; ++i) {
590  if (htk_reorder) {
591  mfcc_t c0 = frames[i][0];
592  memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
593  frames[i][wtf->veclen - 1] = c0;
594  }
595  if (swap)
596  for (j = 0; j < wtf->veclen; ++j)
597  SWAP_FLOAT32(frames[i] + j);
598  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
599  E_ERROR_SYSTEM("Writing %d values to %s failed",
600  wtf->veclen, wtf->outfile);
601  return -1;
602  }
603  nfloat += wtf->veclen;
604  }
605  return nfloat;
606 }
607 
611 static int
612 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
613 {
614  int i, j, nfloat = 0;
615 
616  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
617  for (i = 0; i < nfr; ++i) {
618  for (j = 0; j < wtf->veclen; ++j) {
619  fprintf(wtf->outfh, "%.5g", MFCC2FLOAT(frames[i][j]));
620  if (j == wtf->veclen - 1)
621  fprintf(wtf->outfh, "\n");
622  else
623  fprintf(wtf->outfh, " ");
624  }
625  nfloat += wtf->veclen;
626  }
627  return nfloat;
628 }
629 
630 static const output_type_t outtypes[] = {
631  { "sphinx", &output_header_sphinx, &output_frames_sphinx },
632  { "htk", &output_header_htk, &output_frames_htk },
633  { "text", NULL, &output_frames_text }
634 };
635 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
636 
638 sphinx_wave2feat_init(cmd_ln_t *config)
639 {
640  sphinx_wave2feat_t *wtf;
641  int i;
642 
643  wtf = (sphinx_wave2feat_t *)ckd_calloc(1, sizeof(*wtf));
644  wtf->refcount = 1;
645  wtf->config = cmd_ln_retain(config);
646  wtf->fe = fe_init_auto_r(wtf->config);
647  if (!wtf->fe) {
648  E_FATAL("Failed to create feature extraction\n");
649  }
650 
651  wtf->ot = outtypes; /* Default (sphinx) type. */
652  for (i = 0; i < nouttypes; ++i) {
653  output_type_t const *otype = &outtypes[i];
654  if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
655  wtf->ot = otype;
656  break;
657  }
658  }
659  if (i == nouttypes) {
660  E_ERROR("Unknown output type: '%s'\n",
661  cmd_ln_str_r(config, "-ofmt"));
662  sphinx_wave2feat_free(wtf);
663  return NULL;
664  }
665 
666  return wtf;
667 }
668 
669 int
670 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
671 {
672  if (wtf == NULL)
673  return 0;
674  if (--wtf->refcount > 0)
675  return wtf->refcount;
676 
677  if (wtf->audio)
678  ckd_free(wtf->audio);
679  if (wtf->feat)
680  ckd_free_2d(wtf->feat);
681  if (wtf->infile)
682  ckd_free(wtf->infile);
683  if (wtf->outfile)
684  ckd_free(wtf->outfile);
685  if (wtf->infh) {
686  if (fclose(wtf->infh) == EOF)
687  E_ERROR_SYSTEM("Failed to close input file");
688  }
689  if (wtf->outfh) {
690  if (fclose(wtf->outfh) == EOF)
691  E_ERROR_SYSTEM("Failed to close output file");
692  }
693  cmd_ln_free_r(wtf->config);
694  fe_free(wtf->fe);
695  ckd_free(wtf);
696 
697  return 0;
698 }
699 
701 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
702 {
703  ++wtf->refcount;
704  return wtf;
705 }
706 
707 static audio_type_t const *
708 detect_audio_type(sphinx_wave2feat_t *wtf)
709 {
710  audio_type_t const *atype = NULL;
711  int i;
712 
713  /* Special case audio type for Sphinx MFCC inputs. */
714  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
715  || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
716  int rv = mfcc_type.detect(wtf);
717  if (rv == -1)
718  goto error_out;
719  return &mfcc_type;
720  }
721 
722  /* Try to use the type of infile given on the command line. */
723  for (i = 0; i < ntypes; ++i) {
724  int rv;
725  atype = &types[i];
726  if (cmd_ln_boolean_r(wtf->config, atype->name)) {
727  rv = (*atype->detect)(wtf);
728  if (rv == -1)
729  goto error_out;
730  else if (rv == TRUE)
731  break;
732  }
733  }
734  if (i == ntypes) {
735  /* Detect file type of infile and get parameters. */
736  for (i = 0; i < ntypes; ++i) {
737  int rv;
738  atype = &types[i];
739  rv = (*atype->detect)(wtf);
740  if (rv == -1)
741  goto error_out;
742  else if (rv == TRUE)
743  break;
744  }
745  if (i == ntypes)
746  goto error_out;
747  }
748  return atype;
749  error_out:
750  if (wtf->infh)
751  fclose(wtf->infh);
752  wtf->infh = NULL;
753  return NULL;
754 }
755 
756 int
757 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
758  char const *infile, char const *outfile)
759 {
760  int nchans, minfft, nfft, nfloat, veclen;
761  audio_type_t const *atype = NULL;
762  int fshift, fsize;
763 
764  E_INFO("Converting %s to %s\n", infile, outfile);
765 
766  wtf->infile = ckd_salloc(infile);
767 
768  /* Detect input file type. */
769  if ((atype = detect_audio_type(wtf)) == NULL)
770  return -1;
771 
772  /* Determine whether to byteswap input. */
773  wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
774  cmd_ln_str_r(wtf->config, "-input_endian"));
775 
776  /* Make sure the FFT size is sufficiently large. */
777  minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
778  * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
779  for (nfft = 1; nfft < minfft; nfft <<= 1)
780  ;
781  if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
782  E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
783  cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
784  cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
785  fe_free(wtf->fe);
786  wtf->fe = fe_init_auto_r(wtf->config);
787  }
788 
789  /* Get the output frame size (if not already set). */
790  if (wtf->veclen == 0)
791  wtf->veclen = fe_get_output_size(wtf->fe);
792 
793  /* Set up the input and output buffers. */
794  fe_get_input_size(wtf->fe, &fshift, &fsize);
795  /* Want to get at least a whole frame plus shift in here. Also we
796  will either pick or mix multiple channels so we need to read
797  them all at once. */
798  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
799  wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
800  if (wtf->blocksize < (fsize + fshift) * nchans) {
801  E_INFO("Block size of %d too small, increasing to %d\n",
802  wtf->blocksize,
803  (fsize + fshift) * nchans);
804  wtf->blocksize = (fsize + fshift) * nchans;
805  }
806  wtf->audio = (short *)ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
807  wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
808 
809  /* Use the maximum of the input and output frame sizes to allocate this. */
810  veclen = wtf->veclen;
811  if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
812 
813  wtf->feat = (mfcc_t**)ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
814 
815  /* Let's go! */
816  if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
817  E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
818  return -1;
819  }
820  /* Write an empty header, which we'll fill in later. */
821  if (wtf->ot->output_header &&
822  (*wtf->ot->output_header)(wtf, 0) < 0) {
823  E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
824  goto error_out;
825  }
826  wtf->outfile = ckd_salloc(outfile);
827 
828  if ((nfloat = (*atype->decode)(wtf)) < 0) {
829  E_ERROR("Failed to convert");
830  goto error_out;
831  }
832 
833  if (wtf->ot->output_header) {
834  if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
835  E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
836  goto error_out;
837  }
838  if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
839  E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
840  goto error_out;
841  }
842  }
843 
844 
845  if (wtf->audio)
846  ckd_free(wtf->audio);
847  if (wtf->feat)
848  ckd_free_2d(wtf->feat);
849  if (wtf->infile)
850  ckd_free(wtf->infile);
851  if (wtf->outfile)
852  ckd_free(wtf->outfile);
853 
854  wtf->audio = NULL;
855  wtf->infile = NULL;
856  wtf->feat = NULL;
857  wtf->outfile = NULL;
858 
859  if (wtf->outfh)
860  if (fclose(wtf->outfh) == EOF)
861  E_ERROR_SYSTEM("Failed to close output file");
862  wtf->outfh = NULL;
863 
864  return 0;
865 
866 error_out:
867 
868  if (wtf->audio)
869  ckd_free(wtf->audio);
870  if (wtf->feat)
871  ckd_free_2d(wtf->feat);
872  if (wtf->infile)
873  ckd_free(wtf->infile);
874  if (wtf->outfile)
875  ckd_free(wtf->outfile);
876 
877  wtf->audio = NULL;
878  wtf->infile = NULL;
879  wtf->feat = NULL;
880  wtf->outfile = NULL;
881 
882  if (wtf->outfh)
883  if (fclose(wtf->outfh) == EOF)
884  E_ERROR_SYSTEM("Failed to close output file");
885  wtf->outfh = NULL;
886 
887  return -1;
888 }
889 
890 void
891 build_filenames(cmd_ln_t *config, char const *basename,
892  char **out_infile, char **out_outfile)
893 {
894  char const *di, *do_, *ei, *eo;
895 
896  di = cmd_ln_str_r(config, "-di");
897  do_ = cmd_ln_str_r(config, "-do");
898  ei = cmd_ln_str_r(config, "-ei");
899  eo = cmd_ln_str_r(config, "-eo");
900 
901  *out_infile = string_join(di ? di : "",
902  di ? "/" : "",
903  basename,
904  ei ? "." : "",
905  ei ? ei : "",
906  NULL);
907  *out_outfile = string_join(do_ ? do_ : "",
908  do_ ? "/" : "",
909  basename,
910  eo ? "." : "",
911  eo ? eo : "",
912  NULL);
913  /* Build output directory structure if possible/requested (it is
914  * by default). */
915  if (cmd_ln_boolean_r(config, "-build_outdirs")) {
916  char *dirname = ckd_salloc(*out_outfile);
917  path2dirname(*out_outfile, dirname);
918  build_directory(dirname);
919  ckd_free(dirname);
920  }
921 }
922 
923 static int
924 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
925 {
926  hash_table_t *files;
927  hash_iter_t *itor;
928  lineiter_t *li;
929  FILE *ctlfh;
930  int nskip, runlen, npart;
931 
932  if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
933  E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
934  return -1;
935  }
936  nskip = cmd_ln_int32_r(wtf->config, "-nskip");
937  runlen = cmd_ln_int32_r(wtf->config, "-runlen");
938  if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
939  /* Count lines in the file. */
940  int partlen, part, nlines = 0;
941  part = cmd_ln_int32_r(wtf->config, "-part");
942  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
943  ++nlines;
944  fseek(ctlfh, 0, SEEK_SET);
945  partlen = nlines / npart;
946  nskip = partlen * (part - 1);
947  if (part == npart)
948  runlen = -1;
949  else
950  runlen = partlen;
951  }
952  if (runlen != -1){
953  E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
954  files = hash_table_new(runlen, HASH_CASE_YES);
955  }
956  else {
957  E_INFO("Processing all remaining utterances at position %d\n", nskip);
958  files = hash_table_new(1000, HASH_CASE_YES);
959  }
960  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
961  char *c, *infile, *outfile;
962 
963  if (nskip-- > 0)
964  continue;
965  if (runlen == 0) {
966  lineiter_free(li);
967  break;
968  }
969  --runlen;
970 
971  string_trim(li->buf, STRING_BOTH);
972  /* Extract the file ID from the control line. */
973  if ((c = strchr(li->buf, ' ')) != NULL)
974  *c = '\0';
975  if (strlen(li->buf) == 0) {
976  E_WARN("Empty line %d in control file, skipping\n", li->lineno);
977  continue;
978  }
979  build_filenames(wtf->config, li->buf, &infile, &outfile);
980  if (hash_table_lookup(files, infile, NULL) == 0)
981  continue;
982  sphinx_wave2feat_convert_file(wtf, infile, outfile);
983  hash_table_enter(files, infile, outfile);
984  }
985  for (itor = hash_table_iter(files); itor;
986  itor = hash_table_iter_next(itor)) {
987  ckd_free((void *)hash_entry_key(itor->ent));
988  ckd_free(hash_entry_val(itor->ent));
989  }
990  hash_table_free(files);
991  fclose(ctlfh);
992 
993  return 0;
994 }
995 
996 int
997 main(int argc, char *argv[])
998 {
999  sphinx_wave2feat_t *wtf;
1000  cmd_ln_t *config;
1001  int rv;
1002 
1003  config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE);
1004 
1005  if (config && cmd_ln_str_r(config, "-argfile"))
1006  config = cmd_ln_parse_file_r(config, defn,
1007  cmd_ln_str_r(config, "-argfile"), FALSE);
1008  if (config == NULL) {
1009  E_ERROR("Command line parsing failed\n");
1010  return 1;
1011  }
1012 
1013  if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1014  E_ERROR("Failed to initialize wave2feat object\n");
1015  return 1;
1016  }
1017 
1018  /* If there's a control file run through it, otherwise we will do
1019  * a single file (which is what run_control_file will do
1020  * internally too) */
1021  if (cmd_ln_str_r(config, "-c"))
1022  rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1023  else
1024  rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1025  cmd_ln_str_r(config, "-o"));
1026 
1027  sphinx_wave2feat_free(wtf);
1028  cmd_ln_free_r(config);
1029  return rv;
1030 }
#define E_ERROR_SYSTEM(...)
Print error text; Call perror("");.
Definition: err.h:99
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition: cmd_ln.c:1025
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
hash_entry_t * ent
Current entry in that table.
Definition: hash_table.h:170
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition: hash_table.c:309
int veclen
Length of each output vector.
Definition: sphinx_fe.c:84
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
output_type_t const * ot
Output type object.
Definition: sphinx_fe.c:87
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1032
File names related operation.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:553
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition: hash_table.c:653
Line iterator for files.
Definition: pio.h:177
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
#define hash_entry_val(e)
Access macros.
Definition: hash_table.h:175
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:945
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition: hash_table.c:158
FILE * infh
Input file handle.
Definition: sphinx_fe.c:78
int refcount
Reference count.
Definition: sphinx_fe.c:73
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
char * outfile
Path to output file.
Definition: sphinx_fe.c:77
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition: hash_table.c:695
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition: pio.c:620
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
int featsize
Size of feature buffer.
Definition: sphinx_fe.c:83
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:367
RIFF 44-byte header structure for MS wav files.
Definition: sphinx_fe.c:91
FILE * outfh
Output file handle.
Definition: sphinx_fe.c:79
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
int byteswap
Whether byteswapping is necessary.
Definition: sphinx_fe.c:86
mfcc_t ** feat
Feature buffer.
Definition: sphinx_fe.c:81
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:264
int in_veclen
Length of each input vector (for cep<->spec).
Definition: sphinx_fe.c:85
Implementation of logging routines.
Both ends of string.
Definition: strfuncs.h:73
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:508
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by deliminating on " \r\t\n" and putting each tokens into an argv[] for cmd_l...
Definition: cmd_ln.c:761
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
short * audio
Audio buffer.
Definition: sphinx_fe.c:80
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
Opaque structure used to hold the results of command-line parsing.
char * infile
Path to input file.
Definition: sphinx_fe.c:76
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition: hash_table.c:663
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string...
Definition: strfuncs.c:70
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
int blocksize
Size of audio buffer.
Definition: sphinx_fe.c:82
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:74
Hash table implementation.
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
Structure for the front-end computation.
Definition: fe_internal.h:117
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition: cmd_ln.c:985
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:97
fe_t * fe
Front end object.
Definition: sphinx_fe.c:75
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir Caller must have allocate...
Definition: filename.c:68
file IO related operations.