Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add testing setup #106

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ ext/.depend.mk
*.pyc
*~
webdata/
.cache/
1 change: 1 addition & 0 deletions gentle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from forced_aligner import ForcedAligner
from full_transcriber import FullTranscriber
from resample import resample, resampled
from transcription import Transcription
70 changes: 34 additions & 36 deletions gentle/diff_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import os
import sys

import metasentence
import language_model
import standard_kaldi
from resources import Resources
from gentle import metasentence
from gentle import language_model
from gentle import standard_kaldi
from gentle import transcription
from gentle.resources import Resources


# TODO(maxhawkins): try using the (apparently-superior) time-mediated dynamic
Expand All @@ -23,7 +24,7 @@ def align(alignment, ms, **kwargs):
disfluency = kwargs['disfluency'] if 'disfluency' in kwargs else False
disfluencies = kwargs['disfluencies'] if 'disfluencies' in kwargs else []

hypothesis = [X["word"] for X in alignment]
hypothesis = [X.word for X in alignment]
reference = ms.get_kaldi_sequence()

display_seq = ms.get_display_sequence()
Expand All @@ -36,17 +37,16 @@ def align(alignment, ms, **kwargs):
word = hypothesis[a]
if disfluency and word in disfluencies:
hyp_token = alignment[a]
phones = hyp_token.get("phones", [])
start = hyp_token["start"]
end = hyp_token["start"] + hyp_token["duration"]

out.append({
"case": "not-found-in-transcript",
"phones": phones,
"start": start,
"end": end,
"word": word
})
phones = hyp_token.phones or []
start = hyp_token.start
end = hyp_token.start + hyp_token.duration

out.append(transcription.Word(
case="not-found-in-transcript",
phones=phones,
start=start,
end=end,
word=word))
continue

display_word = display_seq[b]
Expand All @@ -55,28 +55,26 @@ def align(alignment, ms, **kwargs):
if op == 'equal':
hyp_word = hypothesis[a]
hyp_token = alignment[a]
phones = hyp_token.get("phones", [])
start = hyp_token["start"]
end = hyp_token["start"] + hyp_token["duration"]

out.append({
"case": "success",
"startOffset": start_offset,
"endOffset": end_offset,
"word": display_word,
"alignedWord": hyp_word,
"phones": phones,
"start": start,
"end": end,
})
phones = hyp_token.phones or []
start = hyp_token.start
end = hyp_token.start + hyp_token.duration

out.append(transcription.Word(
case="success",
startOffset=start_offset,
endOffset=end_offset,
word=display_word,
alignedWord=hyp_word,
phones=phones,
start=start,
end=end))

elif op in ['insert', 'replace']:
out.append({
"case": "not-found-in-audio",
"startOffset": start_offset,
"endOffset": end_offset,
"word": display_word,
})
out.append(transcription.Word(
case="not-found-in-audio",
startOffset=start_offset,
endOffset=end_offset,
word=display_word))
return out

def word_diff(a, b):
Expand Down
7 changes: 4 additions & 3 deletions gentle/forced_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from gentle import language_model
from gentle import metasentence
from gentle import multipass
from gentle.transcription import MultiThreadedTranscriber, Transcription
from gentle.transcriber import MultiThreadedTranscriber
from gentle.transcription import Transcription

class ForcedAligner():

Expand Down Expand Up @@ -31,14 +32,14 @@ def transcribe(self, wavfile, progress_cb=None, logging=None):

# Perform a second-pass with unaligned words
if logging is not None:
logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.get("case") == "not-found-in-audio"]), len(words)))
logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.case == "not-found-in-audio"]), len(words)))

if progress_cb is not None:
progress_cb({'status': 'ALIGNING'})

words = multipass.realign(wavfile, words, self.ms, resources=self.resources, nthreads=self.nthreads, progress_cb=progress_cb)

if logging is not None:
logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.get("case") == "not-found-in-audio"]), len(words)))
logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.case == "not-found-in-audio"]), len(words)))

return Transcription(words=words, transcript=self.transcript)
24 changes: 13 additions & 11 deletions gentle/full_transcriber.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os

from gentle import kaldi_queue
from gentle.transcription import MultiThreadedTranscriber, Transcription
from gentle import transcription
from gentle.transcriber import MultiThreadedTranscriber
from gentle.transcription import Transcription

class FullTranscriber():

Expand All @@ -24,17 +26,17 @@ def make_transcription_alignment(trans):
transcript = ""
words = []
for t_wd in trans:
word = {
"case": "success",
"startOffset": len(transcript),
"endOffset": len(transcript) + len(t_wd["word"]),
"word": t_wd["word"],
"alignedWord": t_wd["word"],
"phones": t_wd["phones"],
"start": t_wd["start"],
"end": t_wd["start"] + t_wd["duration"]}
word = transcription.Word(
case="success",
startOffset=len(transcript),
endOffset=len(transcript) + len(t_wd.word),
word=t_wd.word,
alignedWord=t_wd.word,
phones=t_wd.phones,
start=t_wd.start,
end=t_wd.start + t_wd.duration)
words.append(word)

transcript += word["word"] + " "
transcript += word.word + " "

return Transcription(words=words, transcript=transcript)
34 changes: 19 additions & 15 deletions gentle/multipass.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@
from gentle import metasentence
from gentle import language_model
from gentle import diff_align
from gentle import transcription

def prepare_multipass(alignment):
to_realign = []
last_aligned_word = None
cur_unaligned_words = []

for wd_idx,wd in enumerate(alignment):
if wd['case'] == 'not-found-in-audio':
if wd.case == 'not-found-in-audio':
cur_unaligned_words.append(wd)
elif wd['case'] == 'success':
elif wd.case == 'success':
if len(cur_unaligned_words) > 0:
to_realign.append({
"start": last_aligned_word,
Expand All @@ -41,21 +42,24 @@ def realign(wavfile, alignment, ms, resources, nthreads=4, progress_cb=None):
def realign(chunk):
wav_obj = wave.open(wavfile, 'r')

start_t = (chunk["start"] or {"end": 0})["end"]
end_t = chunk["end"]
if end_t is None:
if chunk["start"] is None:
start_t = 0
else:
start_t = chunk["start"].end

if chunk["end"] is None:
end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
else:
end_t = end_t["start"]
end_t = chunk["end"].start

duration = end_t - start_t
if duration < 0.01 or duration > 60:
logging.debug("cannot realign %d words with duration %f" % (len(chunk['words']), duration))
return

# Create a language model
offset_offset = chunk['words'][0]['startOffset']
chunk_len = chunk['words'][-1]['endOffset'] - offset_offset
offset_offset = chunk['words'][0].startOffset
chunk_len = chunk['words'][-1].endOffset - offset_offset
chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8")
chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
chunk_ks = chunk_ms.get_kaldi_sequence()
Expand All @@ -71,21 +75,21 @@ def realign(chunk):
buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

k.push_chunk(buf)
ret = k.get_final()
ret = [transcription.Word(**wd) for wd in k.get_final()]
k.stop()

word_alignment = diff_align.align(ret, chunk_ms)

# Adjust startOffset, endOffset, and timing to match originals
for wd in word_alignment:
if wd.get("end"):
if wd.end is not None:
# Apply timing offset
wd['start'] += start_t
wd['end'] += start_t
wd.start += start_t
wd.end += start_t

if wd.get("endOffset"):
wd['startOffset'] += offset_offset
wd['endOffset'] += offset_offset
if wd.endOffset is not None:
wd.startOffset += offset_offset
wd.endOffset += offset_offset

# "chunk" should be replaced by "words"
realignments.append({"chunk": chunk, "words": word_alignment})
Expand Down
1 change: 0 additions & 1 deletion gentle/standard_kaldi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import tempfile
import wave

from gentle import ffmpeg
from util.paths import get_binary
from gentle.rpc import RPCProtocol
from gentle.resources import Resources
Expand Down
84 changes: 84 additions & 0 deletions gentle/transcriber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import math
import logging
import wave

from gentle import transcription

from multiprocessing.pool import ThreadPool as Pool

class MultiThreadedTranscriber:
    """Transcribe a wav file by cutting it into fixed-length, overlapping
    chunks and running a pool of Kaldi instances over them in parallel.

    `kaldi_queue` is a queue of ready-to-use Kaldi decoder objects shared
    by the worker threads; `chunk_len` and `overlap_t` are in seconds.
    """
    def __init__(self, kaldi_queue, chunk_len=20, overlap_t=2, nthreads=4):
        self.chunk_len = chunk_len    # seconds of audio per chunk
        self.overlap_t = overlap_t    # seconds shared between adjacent chunks
        self.nthreads = nthreads

        self.kaldi_queue = kaldi_queue

    def transcribe(self, wavfile, progress_cb=None):
        """Return a flat list of transcription.Word with file-relative timings.

        `progress_cb`, if given, is called after each finished chunk with a
        dict containing the chunk's words ("message") and a completion
        fraction ("percent").
        """
        wav_obj = wave.open(wavfile, 'r')
        duration = wav_obj.getnframes() / float(wav_obj.getframerate())
        n_chunks = int(math.ceil(duration / float(self.chunk_len - self.overlap_t)))

        chunks = []

        def transcribe_chunk(idx):
            # Each worker opens its own handle so seeks don't interfere.
            chunk_wav = wave.open(wavfile, 'r')
            offset_t = idx * (self.chunk_len - self.overlap_t)
            chunk_wav.setpos(int(offset_t * chunk_wav.getframerate()))
            audio = chunk_wav.readframes(int(self.chunk_len * chunk_wav.getframerate()))

            # Borrow a decoder from the shared pool, use it, and return it.
            kaldi = self.kaldi_queue.get()
            kaldi.push_chunk(audio)
            hyp = kaldi.get_final()
            kaldi.reset()
            self.kaldi_queue.put(kaldi)

            chunks.append({"start": offset_t, "words": hyp})
            logging.info('%d/%d' % (len(chunks), n_chunks))
            if progress_cb is not None:
                progress_cb({"message": ' '.join([X['word'] for X in hyp]),
                             "percent": len(chunks) / float(n_chunks)})

        pool = Pool(min(n_chunks, self.nthreads))
        pool.map(transcribe_chunk, range(n_chunks))
        pool.close()

        # Workers append in completion order; restore chronological order.
        chunks.sort(key=lambda c: c['start'])

        # Combine chunks, shifting each word's timing by its chunk's offset.
        # TODO: remove overlap? ...or just let the sequence aligner deal with it.
        words = []
        for chunk in chunks:
            base_t = chunk['start']
            for wd in chunk['words']:
                wd['start'] += base_t
                words.append(transcription.Word(**wd))

        return words


if __name__=='__main__':
    # Command-line usage: transcribe an audio file with a small pool of
    # Kaldi instances and dump the result as JSON.
    #   python transcriber.py AUDIO_FILE OUTPUT_JSON
    from Queue import Queue
    from gentle import standard_kaldi
    # BUG FIX: the original called `gentle.resampled(...)` without ever
    # importing `gentle` (NameError at runtime); import the helper directly.
    # The unused `from util import ffmpeg` import was dropped.
    from gentle.resample import resampled

    import sys

    import logging
    logging.getLogger().setLevel('INFO')

    k_queue = Queue()
    for i in range(3):
        k_queue.put(standard_kaldi.Kaldi())

    trans = MultiThreadedTranscriber(k_queue)

    with resampled(sys.argv[1]) as filename:
        words = trans.transcribe(filename)

    # BUG FIX: transcribe() returns a plain list of Words, which has no
    # to_json(); wrap it in a Transcription first. Also close the output
    # file deterministically instead of leaking the handle.
    # NOTE(review): confirm Transcription.to_json tolerates transcript=None.
    with open(sys.argv[2], 'w') as out_f:
        out_f.write(transcription.Transcription(words=words, transcript=None).to_json())

Loading