cmusphinx · dhdaines · Sep 27, 2022 · Sep 19, 2022 · Sep 20, 2022 · Sep 20, 2022
diff --git a/README.md b/README.md
@@ -56,7 +56,8 @@ The `pocketsphinx` command-line program reads single-channel 16-bit
 PCM audio from standard input or one or more files, and attemps to
 recognize speech in it using the default acoustic and language model.
 It accepts a large number of options which you probably don't care
-about, and a *command* which defaults to `live`.
+about, a *command* which defaults to `live`, and one or more inputs
+(except in `align` mode), or `-` to read from standard input.
 
 If you have a single-channel WAV file called "speech.wav" and you want
 to recognize speech in it, you can try doing this (the results may not
@@ -86,29 +87,56 @@ The commands are as follows:
     - `t`: Full text of recognition result
     - `w`: List of segments (usually words), each of which in turn
       contains the `b`, `d`, `p`, and `t` fields, for start, end,
-      probability, and the text of the word.  In the future we may
-      also support hierarchical results in which case `w` could be
-      present.
+      probability, and the text of the word.  If `-phone_align yes`
+      has been passed, then each word will in turn contain a `w`
+      field holding its phone segmentation, in the same format.
 
   - `single`: Recognize each input as a single utterance, and write a
     JSON object in the same format described above.
+
+  - `align`: Align a single input file (or `-` for standard input) to
+    a word sequence, and write a JSON object in the same format
+    described above.  The first positional argument is the input, and
+    all subsequent ones are concatenated to make the text, to avoid
+    surprises if you forget to quote it.  You are responsible for
+    normalizing the text to remove punctuation, uppercase, centipedes,
+    etc. For example:
+
+        pocketsphinx align goforward.wav "go forward ten meters"
+
+    By default, only word-level alignment is done.  To get phone
+    alignments, pass `-phone_align yes` in the flags, e.g.:
+
+        pocketsphinx -phone_align yes align audio.wav $text
+
+    This produces output that is not particularly readable, but you can use
+    [jq](https://stedolan.github.io/jq/) to clean it up.  For example,
+    you can get just the word names and start times like this:
+
+        pocketsphinx align audio.wav $text | jq '.w[]|[.t,.b]'
+
+    Or you could get the phone names and durations like this:
+
+        pocketsphinx -phone_align yes align audio.wav $text | jq '.w[]|.w[]|[.t,.d]'
+
+    There are many, many other possibilities, of course.
 
   - `soxflags`: Return arguments to `sox` which will create the
     appropriate input format.  Note that because the `sox`
     command-line is slightly quirky these must always come *after* the
     filename or `-d` (which tells `sox` to read from the microphone).
     You can run live recognition like this:
 
-        sox -d $(pocketsphinx soxflags) | pocketsphinx
+        sox -d $(pocketsphinx soxflags) | pocketsphinx -
 
     or decode from a file named "audio.mp3" like this:
 
-        sox audio.mp3 $(pocketsphinx soxflags) | pocketsphinx
+        sox audio.mp3 $(pocketsphinx soxflags) | pocketsphinx -
 
 By default only errors are printed to standard error, but if you want
 more information you can pass `-loglevel INFO`.  Partial results are
 not printed, maybe they will be in the future, but don't hold your
-breath.  Force-alignment is likely to be supported soon, however.
+breath.
 
 Programming
 -----------

diff --git a/cython/_pocketsphinx.pxd b/cython/_pocketsphinx.pxd
@@ -422,7 +422,9 @@ cdef extern from "pocketsphinx/search.h":
     int ps_add_keyphrase(ps_decoder_t *ps, const char *name, const char *keyphrase)
     int ps_add_allphone(ps_decoder_t *ps, const char *name, ngram_model_t *lm)
     int ps_add_allphone_file(ps_decoder_t *ps, const char *name, const char *path)
-    int ps_add_align(ps_decoder_t *ps, const char *name, const char *words)
+    int ps_set_align_text(ps_decoder_t *ps, const char *words)
+    int ps_set_alignment(ps_decoder_t *ps, ps_alignment_t *al)
+    ps_alignment_t *ps_get_alignment(ps_decoder_t *ps)
 
 cdef extern from "pocketsphinx/vad.h":
     ctypedef struct ps_vad_t:
@@ -472,3 +474,37 @@ cdef extern from "pocketsphinx/endpointer.h":
     int ps_endpointer_in_speech(ps_endpointer_t *ep)
     double ps_endpointer_speech_start(ps_endpointer_t *ep)
     double ps_endpointer_speech_end(ps_endpointer_t *ep)
+
+cdef extern from "pocketsphinx/alignment.h":
+    ctypedef struct ps_alignment_t:
+        pass  # opaque: no fields are exposed to Cython
+    ctypedef struct ps_alignment_iter_t:
+        pass  # opaque: no fields are exposed to Cython
+    ctypedef struct pid_struct:
+        short cipid
+        unsigned short ssid
+        int tmat
+    ctypedef union id_union:
+        int wid
+        pid_struct pid
+        unsigned short senid
+    ctypedef struct ps_alignment_entry_t:
+        int start
+        int duration
+        int score
+        id_union id
+        int parent
+        int child
+    ps_alignment_t *ps_alignment_retain(ps_alignment_t *al)
+    int ps_alignment_free(ps_alignment_t *al)
+    int ps_alignment_n_words(ps_alignment_t *al)
+    int ps_alignment_n_phones(ps_alignment_t *al)
+    int ps_alignment_n_states(ps_alignment_t *al)
+    ps_alignment_iter_t *ps_alignment_words(ps_alignment_t *al)
+    ps_alignment_iter_t *ps_alignment_phones(ps_alignment_t *al)
+    ps_alignment_iter_t *ps_alignment_states(ps_alignment_t *al)
+    ps_alignment_iter_t *ps_alignment_iter_next(ps_alignment_iter_t *itor)
+    ps_alignment_iter_t *ps_alignment_iter_children(ps_alignment_iter_t *itor)
+    int ps_alignment_iter_seg(ps_alignment_iter_t *itor, int *start, int *duration)
+    const char *ps_alignment_iter_name(ps_alignment_iter_t *itor)
+    int ps_alignment_iter_free(ps_alignment_iter_t *itor)
diff --git a/cython/_pocketsphinx.pyx b/cython/_pocketsphinx.pyx
@@ -1602,6 +1602,100 @@ cdef class Decoder:
                       DeprecationWarning)
         return self.current_search()
 
+    def set_align_text(self, text):
+        """Set a word sequence for alignment *and* enable alignment mode.
+
+        Unlike the `add_*` methods and the deprecated, badly-named
+        `set_*` methods, this really does immediately enable the
+        resulting search module.  This is because alignment is
+        typically a one-shot deal, i.e. you are not likely to create a
+        list of different alignments and keep them around.  If you
+        really want to do that, perhaps you should use FSG search
+        instead.  Or let me know and perhaps I'll add an
+        `add_align_text` method.
+
+        You must do any text normalization yourself.  For word-level
+        alignment, once you call this, simply decode and get the
+        segmentation in the usual manner.  For phone-level alignment,
+        see `set_alignment` and `get_alignment`.
+
+        Args:
+            text(str): Sentence to align, as whitespace-separated
+                       words.  All words must be present in the
+                       dictionary.
+        Raises:
+            RuntimeError: If text is invalid somehow.
+        """
+        cdef int rv = ps_set_align_text(self._ps, text.encode("utf-8"))
+        if rv < 0:
+            raise RuntimeError("Failed to set up alignment of %s" % (text))
+
+    def set_alignment(self, Alignment alignment = None):
+        """Set up *and* activate sub-word alignment mode.
+
+        For efficiency reasons, decoding and word-level alignment (as
+        done by `set_align_text`) do not track alignments at the
+        sub-word level.  This is fine for a lot of use cases, but
+        obviously not all of them.  If you want to obtain phone or
+        state level alignments, you must run a second pass of
+        alignment, which is what this function sets you up to do.  The
+        sequence is something like this:
+
+            decoder.set_align_text("hello world")
+            decoder.start_utt()
+            decoder.process_raw(data, full_utt=True)
+            decoder.end_utt()
+            decoder.set_alignment()
+            decoder.start_utt()
+            decoder.process_raw(data, full_utt=True)
+            decoder.end_utt()
+            for word in decoder.get_alignment():
+                for phone in word:
+                    for state in phone:
+                        print(word, phone, state)
+
+        That's a lot of code, so it may get simplified, either here or
+        in a derived class, before release.
+
+        Note that if you are using this with N-Gram or FSG decoding,
+        you can restore the default search module afterwards by
+        calling activate_search() with no argument.
+
+        Args:
+            alignment(Alignment): Pre-constructed `Alignment` object.
+                  Currently you can't actually do anything with this.
+        Raises:
+            RuntimeError: If current hypothesis cannot be aligned (such
+                          as when using keyphrase or allphone search).
+
+        """
+        cdef int rv
+        if alignment is not None:
+            rv = ps_set_alignment(self._ps, alignment._al)
+        else:
+            rv = ps_set_alignment(self._ps, NULL)
+        if rv < 0:
+            raise RuntimeError("Failed to set up sub-word alignment")
+
+    def get_alignment(self):
+        """Get the current sub-word alignment, if any.
+
+        This will return something if `set_alignment` has been
+        called, but it will not contain an actual *alignment*
+        (i.e. phone and state durations) unless a second pass of
+        decoding has been run.
+
+        If the decoder is not in sub-word alignment mode then it will
+        return None.
+
+        Returns:
+            Alignment - if an alignment exists.
+        """
+        cdef ps_alignment_t *al = ps_get_alignment(self._ps)
+        if al == NULL:
+            return None
+        return Alignment.create_from_ptr(ps_alignment_retain(al))  # paired with ps_alignment_free in Alignment.__dealloc__
+
     def n_frames(self):
         """Get the number of frames processed up to this point.
 
@@ -1814,6 +1908,75 @@ cdef class Endpointer:
             return None
         return (<const unsigned char *>&outbuf[0])[:out_n_samples * 2]
 
+cdef class AlignmentEntry:
+    cdef public int start
+    cdef public int duration
+    cdef public int score
+    cdef public str name
+    # DANGER! Borrowed pointer, not retained: this class never frees it!
+    cdef ps_alignment_iter_t *itor
+    @staticmethod
+    cdef create_from_iter(ps_alignment_iter_t *itor):
+        cdef AlignmentEntry self
+        self = AlignmentEntry.__new__(AlignmentEntry)
+        self.score = ps_alignment_iter_seg(itor, &self.start, &self.duration)  # fills start/duration in place
+        self.name = ps_alignment_iter_name(itor).decode('utf-8')
+        self.itor = itor  # DANGER! DANGER! Raw copy of the caller's iterator pointer.
+        return self
+
+    def __iter__(self):
+        cdef ps_alignment_iter_t *itor = ps_alignment_iter_children(self.itor)
+        while itor != NULL:
+            c = AlignmentEntry.create_from_iter(itor)
+            yield c
+            itor = ps_alignment_iter_next(itor)
+        # FIXME: will leak memory if iteration stopped short! (ps_alignment_iter_free is never called here)
+
+cdef class Alignment:
+    """Sub-word alignment.
+
+    For the moment this is read-only.
+    """
+    cdef ps_alignment_t *_al
+
+    @staticmethod
+    cdef create_from_ptr(ps_alignment_t *al):
+        cdef Alignment self = Alignment.__new__(Alignment)
+        self._al = al
+        return self
+
+    def __dealloc__(self):
+        if self._al != NULL:
+            ps_alignment_free(self._al)
+
+    def __iter__(self):
+        return self.words()
+
+    def words(self):
+        """Iterate over words in the alignment."""
+        cdef ps_alignment_iter_t *itor = ps_alignment_words(self._al)
+        while itor != NULL:
+            w = AlignmentEntry.create_from_iter(itor)
+            yield w
+            itor = ps_alignment_iter_next(itor)
+        # FIXME: will leak memory if iteration stopped short!
+
+    def phones(self):
+        """Iterate over phones in the alignment."""
+        cdef ps_alignment_iter_t *itor = ps_alignment_phones(self._al)
+        while itor != NULL:  # NOTE(review): same loop as words(); presumably the same leak if stopped short
+            p = AlignmentEntry.create_from_iter(itor)
+            yield p
+            itor = ps_alignment_iter_next(itor)
+
+    def states(self):
+        """Iterate over states in the alignment."""
+        cdef ps_alignment_iter_t *itor = ps_alignment_states(self._al)
+        while itor != NULL:  # NOTE(review): same loop as words(); presumably the same leak if stopped short
+            s = AlignmentEntry.create_from_iter(itor)
+            yield s
+            itor = ps_alignment_iter_next(itor)
+
 def set_loglevel(level):
     """Set internal log level of PocketSphinx.
 

diff --git a/cython/test/alignment_test.py b/cython/test/alignment_test.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+
+import os
+from pocketsphinx import Decoder
+import unittest
+
+DATADIR = os.path.join(os.path.dirname(__file__), "../../test/data")
+
+
+class TestAlignment(unittest.TestCase):
+    def _run_decode(self, decoder, expect_fail=False):  # NOTE(review): expect_fail is never read
+        with open(os.path.join(DATADIR, "goforward.raw"), "rb") as fh:
+            buf = fh.read()
+            decoder.start_utt()
+            decoder.process_raw(buf, no_search=False, full_utt=True)
+            decoder.end_utt()
+
+    def test_alignment(self):
+        decoder = Decoder(lm=None)
+        decoder.set_align_text("go forward ten meters")
+        self._run_decode(decoder)  # first pass: word-level alignment
+        words = []
+        for seg in decoder.seg():
+            if seg.word not in ("<s>", "</s>", "<sil>", "(NULL)"):
+                words.append((seg.word, seg.start_frame, seg.end_frame))
+        print(words)  # NOTE(review): results are only printed; this test asserts nothing
+        decoder.set_alignment()
+        self._run_decode(decoder)  # second pass over the same audio, after set_alignment()
+        for word in decoder.get_alignment():
+            print(word.start, word.duration, word.score, word.name)
+            for phone in word:
+                print("\t", phone.start, phone.duration, phone.score, phone.name)
+                for state in phone:
+                    print("\t\t", state.start, state.duration, state.score, state.name)
+
+
+if __name__ == "__main__":
+    unittest.main()