Skip to content

Commit

Permalink
Merge pull request #300 from cmusphinx/enhanced_alignment
Browse files Browse the repository at this point in the history
New force-alignment API and two-pass alignment to get phone/state durations
  • Loading branch information
dhdaines authored Sep 27, 2022
2 parents 9e8f704 + 6dba1b2 commit 68c5db8
Show file tree
Hide file tree
Showing 46 changed files with 1,519 additions and 409 deletions.
42 changes: 35 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ The `pocketsphinx` command-line program reads single-channel 16-bit
PCM audio from standard input or one or more files, and attempts to
recognize speech in it using the default acoustic and language model.
It accepts a large number of options which you probably don't care
about, and a *command* which defaults to `live`.
about, a *command* which defaults to `live`, and one or more inputs
(except in `align` mode), or `-` to read from standard input.

If you have a single-channel WAV file called "speech.wav" and you want
to recognize speech in it, you can try doing this (the results may not
Expand Down Expand Up @@ -86,29 +87,56 @@ The commands are as follows:
- `t`: Full text of recognition result
- `w`: List of segments (usually words), each of which in turn
contains the `b`, `d`, `p`, and `t` fields, for start, duration,
probability, and the text of the word. In the future we may
also support hierarchical results in which case `w` could be
present.
probability, and the text of the word. If `-phone_align yes`
has been passed, then a `w` field will be present containing
phone segmentations, in the same format.

- `single`: Recognize each input as a single utterance, and write a
JSON object in the same format described above.

- `align`: Align a single input file (or `-` for standard input) to
a word sequence, and write a JSON object in the same format
described above. The first positional argument is the input, and
all subsequent ones are concatenated to make the text, to avoid
surprises if you forget to quote it. You are responsible for
normalizing the text to remove punctuation, uppercase, centipedes,
etc. For example:

pocketsphinx align goforward.wav "go forward ten meters"

By default, only word-level alignment is done. To get phone
alignments, pass `-phone_align yes` in the flags, e.g.:

pocketsphinx -phone_align yes align audio.wav $text

This will not produce particularly readable output, but you can use
[jq](https://stedolan.github.io/jq/) to clean it up. For example,
you can get just the word names and start times like this:

pocketsphinx align audio.wav $text | jq '.w[]|[.t,.b]'

Or you could get the phone names and durations like this:

pocketsphinx -phone_align yes align audio.wav $text | jq '.w[]|.w[]|[.t,.d]'

There are many, many other possibilities, of course.

- `soxflags`: Return arguments to `sox` which will create the
appropriate input format. Note that because the `sox`
command-line is slightly quirky these must always come *after* the
filename or `-d` (which tells `sox` to read from the microphone).
You can run live recognition like this:

sox -d $(pocketsphinx soxflags) | pocketsphinx
sox -d $(pocketsphinx soxflags) | pocketsphinx -

or decode from a file named "audio.mp3" like this:

sox audio.mp3 $(pocketsphinx soxflags) | pocketsphinx
sox audio.mp3 $(pocketsphinx soxflags) | pocketsphinx -

By default only errors are printed to standard error, but if you want
more information you can pass `-loglevel INFO`. Partial results are
not printed, maybe they will be in the future, but don't hold your
breath. Force-alignment is likely to be supported soon, however.
breath.

Programming
-----------
Expand Down
38 changes: 37 additions & 1 deletion cython/_pocketsphinx.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,9 @@ cdef extern from "pocketsphinx/search.h":
int ps_add_keyphrase(ps_decoder_t *ps, const char *name, const char *keyphrase)
int ps_add_allphone(ps_decoder_t *ps, const char *name, ngram_model_t *lm)
int ps_add_allphone_file(ps_decoder_t *ps, const char *name, const char *path)
int ps_add_align(ps_decoder_t *ps, const char *name, const char *words)
int ps_set_align_text(ps_decoder_t *ps, const char *words)
int ps_set_alignment(ps_decoder_t *ps, ps_alignment_t *al)
ps_alignment_t *ps_get_alignment(ps_decoder_t *ps)

cdef extern from "pocketsphinx/vad.h":
ctypedef struct ps_vad_t:
Expand Down Expand Up @@ -472,3 +474,37 @@ cdef extern from "pocketsphinx/endpointer.h":
int ps_endpointer_in_speech(ps_endpointer_t *ep)
double ps_endpointer_speech_start(ps_endpointer_t *ep)
double ps_endpointer_speech_end(ps_endpointer_t *ep)

# Cython declarations for the C alignment API in pocketsphinx/alignment.h.
cdef extern from "pocketsphinx/alignment.h":
# No members are declared for these two structs; the declarations below
# only ever use them through pointers.
ctypedef struct ps_alignment_t:
pass
ctypedef struct ps_alignment_iter_t:
pass
# Member type of `id_union.pid` below.
ctypedef struct pid_struct:
short cipid
unsigned short ssid
int tmat
# Identifier field of an alignment entry (`ps_alignment_entry_t.id`).
# NOTE(review): presumably wid/pid/senid apply to word/phone/state
# entries respectively -- confirm against alignment.h.
ctypedef union id_union:
int wid
pid_struct pid
unsigned short senid
ctypedef struct ps_alignment_entry_t:
int start
int duration
int score
id_union id
int parent
int child
# Reference management: `Decoder.get_alignment` retains the pointer it
# wraps and `Alignment.__dealloc__` releases it with ps_alignment_free.
ps_alignment_t *ps_alignment_retain(ps_alignment_t *al)
int ps_alignment_free(ps_alignment_t *al)
# Entry counts, one function per level.
int ps_alignment_n_words(ps_alignment_t *al)
int ps_alignment_n_phones(ps_alignment_t *al)
int ps_alignment_n_states(ps_alignment_t *al)
# Iterator constructors, one per level.
ps_alignment_iter_t *ps_alignment_words(ps_alignment_t *al)
ps_alignment_iter_t *ps_alignment_phones(ps_alignment_t *al)
ps_alignment_iter_t *ps_alignment_states(ps_alignment_t *al)
# Iterator operations. The Python wrappers loop until
# ps_alignment_iter_next returns NULL.
ps_alignment_iter_t *ps_alignment_iter_next(ps_alignment_iter_t *itor)
ps_alignment_iter_t *ps_alignment_iter_children(ps_alignment_iter_t *itor)
# Writes the segment bounds through `start` and `duration`; the wrapper
# stores the int return value as the entry's score.
int ps_alignment_iter_seg(ps_alignment_iter_t *itor, int *start, int *duration)
const char *ps_alignment_iter_name(ps_alignment_iter_t *itor)
int ps_alignment_iter_free(ps_alignment_iter_t *itor)
163 changes: 163 additions & 0 deletions cython/_pocketsphinx.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1602,6 +1602,100 @@ cdef class Decoder:
DeprecationWarning)
return self.current_search()

def set_align_text(self, text):
    """Set the word sequence to align *and* switch to alignment mode.

    Unlike the `add_*` methods (and the deprecated, badly-named
    `set_*` methods), this call immediately enables the resulting
    search module.  Alignment is typically a one-shot operation, so
    you are unlikely to keep a collection of different alignments
    around; if you really need that, FSG search may be a better fit.

    Text normalization is the caller's responsibility.  For
    word-level alignment, call this, then decode and read the
    segmentation in the usual manner.  For phone-level alignment,
    see `set_alignment` and `get_alignment`.

    Args:
        text(str): Sentence to align, as whitespace-separated
                   words.  All words must be present in the
                   dictionary.
    Raises:
        RuntimeError: If text is invalid somehow.
    """
    # Encode up front; the C function takes a `const char *`.
    encoded = text.encode("utf-8")
    if ps_set_align_text(self._ps, encoded) < 0:
        raise RuntimeError("Failed to set up alignment of %s" % (text))

def set_alignment(self, Alignment alignment = None):
    """Set up *and* activate sub-word alignment mode.

    For efficiency, decoding and word-level alignment (as done by
    `set_align_text`) do not track alignments below the word level.
    To obtain phone or state level alignments you must run a second
    pass of alignment, which this method prepares.  The sequence is
    something like this:

        decoder.set_align_text("hello world")
        decoder.start_utt()
        decoder.process_raw(data, full_utt=True)
        decoder.end_utt()
        decoder.set_alignment()
        decoder.start_utt()
        decoder.process_raw(data, full_utt=True)
        decoder.end_utt()
        for word in decoder.get_alignment():
            for phone in word:
                for state in phone:
                    print(word, phone, state)

    If you are using this with N-Gram or FSG decoding, you can
    restore the default search module afterwards by calling
    activate_search() with no argument.

    Args:
        alignment(Alignment): Pre-constructed `Alignment` object.
              Currently you can't actually do anything with this.
    Raises:
        RuntimeError: If current hypothesis cannot be aligned (such
                      as when using keyphrase or allphone search).
    """
    # NULL is what gets passed when the caller supplies no Alignment.
    cdef ps_alignment_t *al = NULL
    if alignment is not None:
        al = alignment._al
    if ps_set_alignment(self._ps, al) < 0:
        raise RuntimeError("Failed to set up sub-word alignment")

def get_alignment(self):
    """Get the current sub-word alignment, if any.

    Something is returned once `set_alignment` has been called, but
    it will not contain an actual *alignment* (i.e. phone and state
    durations) until a second pass of decoding has been run.

    Returns:
        Alignment - if an alignment exists, or None if the decoder
        is not in sub-word alignment mode.
    """
    cdef ps_alignment_t *al = ps_get_alignment(self._ps)
    if al != NULL:
        # Retain before wrapping: the wrapper calls ps_alignment_free
        # on this pointer when it is deallocated.
        return Alignment.create_from_ptr(ps_alignment_retain(al))
    return None

def n_frames(self):
"""Get the number of frames processed up to this point.
Expand Down Expand Up @@ -1814,6 +1908,75 @@ cdef class Endpointer:
return None
return (<const unsigned char *>&outbuf[0])[:out_n_samples * 2]

cdef class AlignmentEntry:
    """One entry of an alignment, as produced by an alignment iterator.

    `start`, `duration` and `score` come from `ps_alignment_iter_seg`
    and `name` from `ps_alignment_iter_name`.  Iterating over an entry
    yields its child entries.
    """
    cdef public int start
    cdef public int duration
    cdef public int score
    cdef public str name
    # DANGER! Borrowed pointer: this object does not retain or own the
    # iterator, so it is only safe to use while the generator that
    # produced this entry is still positioned on it.
    cdef ps_alignment_iter_t *itor

    @staticmethod
    cdef create_from_iter(ps_alignment_iter_t *itor):
        cdef AlignmentEntry entry = AlignmentEntry.__new__(AlignmentEntry)
        entry.itor = itor  # DANGER! DANGER! (not retained, see above)
        # The return value of ps_alignment_iter_seg is kept as the score;
        # start and duration are filled in through the pointers.
        entry.score = ps_alignment_iter_seg(itor, &entry.start, &entry.duration)
        entry.name = ps_alignment_iter_name(itor).decode('utf-8')
        return entry

    def __iter__(self):
        cdef ps_alignment_iter_t *child = ps_alignment_iter_children(self.itor)
        while child != NULL:
            yield AlignmentEntry.create_from_iter(child)
            child = ps_alignment_iter_next(child)
        # FIXME: will leak memory if iteration stopped short!
        # NOTE(review): freeing `child` when the generator is closed
        # early would leave entries already handed to the caller with a
        # dangling `itor`, so the leak is left in place here.

cdef class Alignment:
    """Sub-word alignment.

    For the moment this is read-only.  Iterating over the object is
    the same as iterating over `words()`.
    """
    cdef ps_alignment_t *_al

    @staticmethod
    cdef create_from_ptr(ps_alignment_t *al):
        # The wrapper takes over the caller's reference: __dealloc__
        # calls ps_alignment_free on this pointer.
        cdef Alignment wrapper = Alignment.__new__(Alignment)
        wrapper._al = al
        return wrapper

    def __dealloc__(self):
        if self._al == NULL:
            return
        ps_alignment_free(self._al)

    def __iter__(self):
        return self.words()

    def words(self):
        """Iterate over words in the alignment."""
        cdef ps_alignment_iter_t *cursor = ps_alignment_words(self._al)
        while cursor != NULL:
            yield AlignmentEntry.create_from_iter(cursor)
            cursor = ps_alignment_iter_next(cursor)
        # FIXME: will leak memory if iteration stopped short!

    def phones(self):
        """Iterate over phones in the alignment."""
        cdef ps_alignment_iter_t *cursor = ps_alignment_phones(self._al)
        while cursor != NULL:
            yield AlignmentEntry.create_from_iter(cursor)
            cursor = ps_alignment_iter_next(cursor)

    def states(self):
        """Iterate over states in the alignment."""
        cdef ps_alignment_iter_t *cursor = ps_alignment_states(self._al)
        while cursor != NULL:
            yield AlignmentEntry.create_from_iter(cursor)
            cursor = ps_alignment_iter_next(cursor)

def set_loglevel(level):
"""Set internal log level of PocketSphinx.
Expand Down
38 changes: 38 additions & 0 deletions cython/test/alignment_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/python

import os
from pocketsphinx import Decoder
import unittest

DATADIR = os.path.join(os.path.dirname(__file__), "../../test/data")


class TestAlignment(unittest.TestCase):
    """Exercise word-level alignment followed by a sub-word second pass."""

    def _run_decode(self, decoder, expect_fail=False):
        """Feed goforward.raw to `decoder` as one complete utterance.

        `expect_fail` is accepted but not used by this helper.
        """
        audio_path = os.path.join(DATADIR, "goforward.raw")
        with open(audio_path, "rb") as fh:
            buf = fh.read()
            decoder.start_utt()
            decoder.process_raw(buf, no_search=False, full_utt=True)
            decoder.end_utt()

    def test_alignment(self):
        """Run both alignment passes and print the resulting segments."""
        decoder = Decoder(lm=None)
        decoder.set_align_text("go forward ten meters")
        self._run_decode(decoder)

        # First pass: keep real words only, dropping sentence markers,
        # silence and null segments.
        fillers = ("<s>", "</s>", "<sil>", "(NULL)")
        words = [
            (seg.word, seg.start_frame, seg.end_frame)
            for seg in decoder.seg()
            if seg.word not in fillers
        ]
        print(words)

        # Second pass: decode the same audio again in sub-word alignment
        # mode, then walk the word -> phone -> state hierarchy.
        decoder.set_alignment()
        self._run_decode(decoder)
        for word in decoder.get_alignment():
            print(word.start, word.duration, word.score, word.name)
            for phone in word:
                print("\t", phone.start, phone.duration, phone.score, phone.name)
                for state in phone:
                    print("\t\t", state.start, state.duration, state.score, state.name)


if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit 68c5db8

Please sign in to comment.