"""This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes."""
import logging
from typing import List, Dict, Any, Optional

import numpy as np

logger = logging.getLogger(__name__)

# Phoneme -> viseme-ID lookup table, built once at import time (it used to be
# rebuilt for every single phoneme lookup).
#
# Inspired by:
# https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
#
# NOTE: the per-locale sections below intentionally repeat many keys.  In a
# dict literal the LAST occurrence of a key wins, so a later section silently
# overrides an earlier one (e.g. 'tʃ' is [16] in "basic" but ends up as
# [19, 16]).  The entries are kept exactly as authored so the effective mapping
# does not change; deduplicate with care.
_PHONEME_VISEME_MAP: Dict[str, List[int]] = {
    # basic
    'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6],
    'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15],
    'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19],
    'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0],

    # ar-EG
    "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20],
    "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19],
    "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14],
    "m": [21], "n": [19], "r": [13], "ʕ": [12],

    # bg-BG
    "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16],
    "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15],
    "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18],
    "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21],
    "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15],

    # ca-ES
    "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16],
    "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14],
    "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19],
    "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],

    # cs-CZ
    "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7],
    "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16],
    "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18],
    "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13],
    "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13],

    # da-DK
    "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4],
    "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4],
    "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18],
    "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19],
    "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7],

    # de-DE/de-CH/de-AT
    "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8],
    "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7],
    "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12],
    "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13],
    "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15],
    "ʒ": [16], "ʔ": [19],

    # el-GR
    "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17],
    "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14],
    "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18],
    "x": [12], "z": [15],

    # en-GB/en-IE/en-AU
    "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5],
    "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4],
    "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12],
    "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13],
    "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],

    # en-US/en-CA
    "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7],
    "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13],
    "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6],
    "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18],
    "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16],
    "dʒ": [19, 16], "l": [14], "ɹ": [13],

    # es-ES
    "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17],
    "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19],
    "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],

    # es-MX
    "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17],
    "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19],
    "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12],

    # fi-FI
    "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4],
    "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6],
    "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7],
    "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19],
    "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13],
    "s": [15], "ʃ": [16], "t": [19], "ʋ": [18],

    # fr-FR/fr-CA/fr-CH
    "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3],
    "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19],
    "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16],
    "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15],

    # he-IL
    "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20],
    "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21],
    "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16],

    # hr-HR
    "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8],
    "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16],
    "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16],
    "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16],

    # hu-HU
    "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2],
    "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19],
    "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18],
    "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20],
    "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21],
    "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16],
    "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18],
    "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16],

    # id-ID
    "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8],
    "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20],
    "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13],
    "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15],

    # it-IT
    "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7],
    "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21],
    "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18],
    "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20],
    "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19],
    "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15],
    "v": [18], "vː": [18], "w": [7], "z": [15],

    # ko-KR
    "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1],
    "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4],
    "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16],
    "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16],
    "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21],
    "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19],

    # ms-MY
    "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6],
    "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16],
    "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12],
    "r": [13], "h": [12], "j": [6], "w": [7], "l": [14],

    # nb-NO
    "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3],
    "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6],
    "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20],
    "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21],
    "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19],

    # nl-NL/nl-BE
    "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4],
    "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1],
    "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12],
    "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15],
    "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16],

    # pl-PL
    "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21],
    "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16],
    "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21],
    "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15],
    "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12],
    "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16],

    # pt-BR
    "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1],
    "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20],
    "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16],
    "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20],

    # pt-PT
    "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6],
    "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7],
    "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6],
    "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14],
    "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19],
    "v": [18], "w": [7], "z": [15], "ʒ": [16],

    # ro-RO
    "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2],
    "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20],
    "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19],
    "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19],
    "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7],
    "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16],

    # ru-RU
    "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21],
    "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20],
    "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15],
    "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21],
    "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6],

    # sk-SK
    "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8],
    "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1],
    "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15],
    "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16],
    "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14],
    "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6],
    "w": [7],

    # sl-SI
    "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3],
    "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16],
    "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14],
    "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19],
    "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16],
    "z": [15],

    # sv-SE
    "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4],
    "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8],
    "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20],
    "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14],
    "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19],

    # th-TH
    "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1],
    "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4],
    "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6],
    "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13],
    "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19],

    # tr-TR
    "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8],
    "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21],
    "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6],
    "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19],
    "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],

    # vi-VN
    "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6],
    "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7],
    "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4],
    "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6],
    "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20],
    "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19],
    "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7]
}


class SpeechToVisemes():
    """
    Handles the conversion of speech to visemes using a phoneme-to-viseme mapping.

    Attributes:
        model_name (str): The name of the model used for speech recognition.
        device (str): The device to run the model on (e.g., "cpu", "mps", "cuda").
        gen_kwargs (dict): Additional generation parameters. They are stored on the
            instance but are not forwarded to the pipeline by this class.
        asr_pipeline (transformers.Pipeline): The automatic speech recognition pipeline.
    """

    def __init__(
        self,
        model_name="bookbot/wav2vec2-ljspeech-gruut",
        device="mps",
        gen_kwargs: Optional[dict] = None,
    ):
        """
        Initializes the SpeechToVisemes class with the specified parameters.

        Args:
            model_name (str, optional): The name of the model to use for speech recognition.
            device (str, optional): The device to run the model on.
            gen_kwargs (dict, optional): Additional generation parameters. ``None``
                (the default) gives every instance its own fresh empty dict.

        Raises:
            ImportError: If the ``transformers`` package is not installed.
        """
        # Imported here rather than at module level so the pure phoneme->viseme
        # mapping can be imported and used without loading transformers.
        from transformers import pipeline

        self.model_name = model_name
        self.device = device
        # A ``{}`` default in the signature would be one dict shared by all
        # instances; create a new one per instance instead.
        self.gen_kwargs = {} if gen_kwargs is None else gen_kwargs

        # Initialize the automatic speech recognition pipeline
        self.asr_pipeline = pipeline(
            "automatic-speech-recognition", model=model_name, device=device
        )

    def _map_phonemes_to_visemes(
        self,
        data: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """
        Maps phonemes to corresponding visemes with timestamps.

        Refer to the following references for more information on the phoneme-to-viseme mapping:
        - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes
        - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets

        Args:
            data (Dict[str, Any]): A dictionary containing phoneme data, where `data['chunks']`
                holds a list of dicts with a `'text'` (phoneme) and a `'timestamp'` entry.
                A missing `'chunks'` key is treated as an empty list.

        Returns:
            List[Dict[str, Any]]: One `{'viseme': id, 'timestamp': ts}` dict per viseme, in
                chunk order. A phoneme that maps to several visemes yields several entries
                sharing the chunk's timestamp (copied unchanged); a phoneme that is missing
                or not in the table yields no entry.
        """
        viseme_list = []

        for chunk in data.get('chunks', []):
            timestamp = chunk.get('timestamp')
            # Unknown or missing phonemes are skipped silently (empty default).
            for viseme in _PHONEME_VISEME_MAP.get(chunk.get('text'), ()):
                viseme_list.append({
                    'viseme': viseme,
                    'timestamp': timestamp
                })

        return viseme_list

    def process(self, audio_file: Any) -> List[Dict[str, Any]]:
        """
        Process audio and convert speech to visemes.

        Args:
            audio_file: The audio input, passed unchanged to the ASR pipeline
                (for example a path to an audio file).

        Returns:
            List[Dict[str, Any]]: The viseme/timestamp entries produced by
                `_map_phonemes_to_visemes` for the pipeline's output.
        """
        # Perform ASR to get phoneme data
        asr_result = self.asr_pipeline(audio_file, return_timestamps='char')
        # Map phonemes to visemes
        return self._map_phonemes_to_visemes(asr_result)
import torch +from .STV.speech_to_visemes import SpeechToVisemes logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -22,6 +23,7 @@ def setup( gen_kwargs={}, # Unused stream=True, chunk_size=512, + viseme_flag = True ): self.should_listen = should_listen self.device = device @@ -33,6 +35,9 @@ def setup( self.params_infer_code = ChatTTS.Chat.InferCodeParams( spk_emb=rnd_spk_emb, ) + self.viseme_flag = viseme_flag + if self.viseme_flag: + self.speech_to_visemes = SpeechToVisemes() self.warmup() def warmup(self): @@ -61,22 +66,65 @@ def process(self, llm_sentence): if gen[0] is None or len(gen[0]) == 0: self.should_listen.set() return + + # Resample the audio to 16000 Hz audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=16000) - audio_chunk = (audio_chunk * 32768).astype(np.int16)[0] - while len(audio_chunk) > self.chunk_size: - yield audio_chunk[: self.chunk_size] # 返回前 chunk_size 字节的数据 - audio_chunk = audio_chunk[self.chunk_size :] # 移除已返回的数据 - yield np.pad(audio_chunk, (0, self.chunk_size - len(audio_chunk))) + # Ensure the audio is converted to mono (single channel) + if len(audio_chunk.shape) > 1: + audio_chunk = librosa.to_mono(audio_chunk) + audio_chunk = (audio_chunk * 32768).astype(np.int16) + + # Process visemes if viseme_flag is set + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + + # Loop through audio chunks, yielding dict for each chunk + for i in range(0, len(audio_chunk), self.chunk_size): + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.chunk_size], + (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), + ) + } + # Include text and visemes for the first chunk + if i == 0: + chunk_data["text"] = llm_sentence # Assuming llm_sentence is defined elsewhere + chunk_data["visemes"] = visemes + + yield chunk_data 
else: wavs = wavs_gen if len(wavs[0]) == 0: self.should_listen.set() return audio_chunk = librosa.resample(wavs[0], orig_sr=24000, target_sr=16000) + # Ensure the audio is converted to mono (single channel) + if len(audio_chunk.shape) > 1: + audio_chunk = librosa.to_mono(audio_chunk) audio_chunk = (audio_chunk * 32768).astype(np.int16) + + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + for i in range(0, len(audio_chunk), self.chunk_size): - yield np.pad( - audio_chunk[i : i + self.chunk_size], - (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), - ) + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.chunk_size], + (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), + ) + } + # For the first chunk, include text and visemes + if i == 0: + chunk_data["text"] = llm_sentence + chunk_data["visemes"] = visemes + yield chunk_data + self.should_listen.set() diff --git a/TTS/melo_handler.py b/TTS/melo_handler.py index b1b2226..fc33730 100644 --- a/TTS/melo_handler.py +++ b/TTS/melo_handler.py @@ -6,6 +6,8 @@ from rich.console import Console import torch +from .STV.speech_to_visemes import SpeechToVisemes + logger = logging.getLogger(__name__) console = Console() @@ -28,7 +30,6 @@ "ko": "KR", } - class MeloTTSHandler(BaseHandler): def setup( self, @@ -38,6 +39,7 @@ def setup( speaker_to_id="en", gen_kwargs={}, # Unused blocksize=512, + viseme_flag = True # To obtain timestamped visemes ): self.should_listen = should_listen self.device = device @@ -49,6 +51,11 @@ def setup( WHISPER_LANGUAGE_TO_MELO_SPEAKER[speaker_to_id] ] self.blocksize = blocksize + + self.viseme_flag = viseme_flag + if self.viseme_flag: + self.speech_to_visemes = SpeechToVisemes() + self.warmup() def warmup(self): @@ -100,10 +107,25 @@ def process(self, llm_sentence): return audio_chunk = 
librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000) audio_chunk = (audio_chunk * 32768).astype(np.int16) + + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + for i in range(0, len(audio_chunk), self.blocksize): - yield np.pad( - audio_chunk[i : i + self.blocksize], - (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])), - ) + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.blocksize], + (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) + ) + } + # For the first chunk, include text and visemes + if i == 0: + chunk_data["text"] = llm_sentence + chunk_data["visemes"] = visemes + yield chunk_data self.should_listen.set() diff --git a/TTS/parler_handler.py b/TTS/parler_handler.py index e8c6c55..2ba49c9 100644 --- a/TTS/parler_handler.py +++ b/TTS/parler_handler.py @@ -14,6 +14,7 @@ from transformers.utils.import_utils import ( is_flash_attn_2_available, ) +from .STV.speech_to_visemes import SpeechToVisemes torch._inductor.config.fx_graph_cache = True # mind about this parameter ! 
should be >= 2 * number of padded prompt sizes for TTS @@ -47,6 +48,7 @@ def setup( ), play_steps_s=1, blocksize=512, + viseme_flag = True ): self.should_listen = should_listen self.device = device @@ -78,6 +80,10 @@ def setup( self.model.forward, mode=self.compile_mode, fullgraph=True ) + self.viseme_flag = viseme_flag + if self.viseme_flag: + self.speech_to_visemes = SpeechToVisemes() + self.warmup() def prepare_model_inputs( @@ -182,10 +188,25 @@ def process(self, llm_sentence): ) audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000) audio_chunk = (audio_chunk * 32768).astype(np.int16) + + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + for i in range(0, len(audio_chunk), self.blocksize): - yield np.pad( - audio_chunk[i : i + self.blocksize], - (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])), - ) + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.blocksize], + (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) + ) + } + # For the first chunk, include text and visemes + if i == 0: + chunk_data["text"] = llm_sentence + chunk_data["visemes"] = visemes + yield chunk_data self.should_listen.set() diff --git a/arguments_classes/parler_tts_arguments.py b/arguments_classes/parler_tts_arguments.py index 5159432..1bb0f21 100644 --- a/arguments_classes/parler_tts_arguments.py +++ b/arguments_classes/parler_tts_arguments.py @@ -36,7 +36,7 @@ class ParlerTTSHandlerArguments: tts_gen_max_new_tokens: int = field( default=512, metadata={ - "help": "Maximum number of new tokens to generate in a single completion. Default is 256, which corresponds to ~6 secs" + "help": "Maximum number of new tokens to generate in a single completion. 
Default is 512, which corresponds to ~6 secs" }, ) description: str = field( diff --git a/connections/local_audio_streamer.py b/connections/local_audio_streamer.py index 389dcb8..d42fbe7 100644 --- a/connections/local_audio_streamer.py +++ b/connections/local_audio_streamer.py @@ -27,7 +27,18 @@ def callback(indata, outdata, frames, time, status): self.input_queue.put(indata.copy()) outdata[:] = 0 * outdata else: - outdata[:] = self.output_queue.get()[:, np.newaxis] + data = self.output_queue.get() + """ + # Check if text data is present and log it + if data.get('text') is not None: + text = data['text'] + logger.info(f"Text: {text}") + # Check if viseme data is present and log it + if data.get('visemes') is not None: + visemes = data['visemes'] + logger.info(f"Visemes: {visemes}") + """ + outdata[:] = data['audio'][:, np.newaxis] logger.debug("Available devices:") logger.debug(sd.query_devices()) diff --git a/connections/socket_sender.py b/connections/socket_sender.py index 11ed210..fb5c7cb 100644 --- a/connections/socket_sender.py +++ b/connections/socket_sender.py @@ -1,6 +1,8 @@ import socket from rich.console import Console import logging +import pickle +import struct logger = logging.getLogger(__name__) @@ -11,7 +13,6 @@ class SocketSender: """ Handles sending generated audio packets to the clients. 
""" - def __init__(self, stop_event, queue_in, host="0.0.0.0", port=12346): self.stop_event = stop_event self.queue_in = queue_in @@ -28,9 +29,31 @@ def run(self): logger.info("sender connected") while not self.stop_event.is_set(): - audio_chunk = self.queue_in.get() - self.conn.sendall(audio_chunk) - if isinstance(audio_chunk, bytes) and audio_chunk == b"END": - break + data = self.queue_in.get() + packet = {} + if 'audio' in data and data['audio'] is not None: + audio_chunk = data['audio'] + packet['audio'] = data['audio'] + if 'text' in data and data['text'] is not None: + packet['text'] = data['text'] + if 'visemes' in data and data['visemes'] is not None: + packet['visemes'] = data['visemes'] + + # Serialize the packet using pickle + serialized_packet = pickle.dumps(packet) + + # Compute the length of the serialized packet + packet_length = len(serialized_packet) + + # Send the packet length as a 4-byte integer using struct + self.conn.sendall(struct.pack('!I', packet_length)) + + # Send the serialized packet + self.conn.sendall(serialized_packet) + + if 'audio' in data and data['audio'] is not None: + if isinstance(audio_chunk, bytes) and audio_chunk == b"END": + break + self.conn.close() logger.info("Sender closed") diff --git a/listen_and_play.py b/listen_and_play.py index 35eabd6..2082a5e 100644 --- a/listen_and_play.py +++ b/listen_and_play.py @@ -4,15 +4,16 @@ from dataclasses import dataclass, field import sounddevice as sd from transformers import HfArgumentParser - +import struct +import pickle @dataclass class ListenAndPlayArguments: send_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."}) recv_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."}) list_play_chunk_size: int = field( - default=1024, - metadata={"help": "The size of data chunks (in bytes). Default is 1024."}, + default=512, + metadata={"help": "The size of data chunks (in bytes). 
Default is 512."}, ) host: str = field( default="localhost", @@ -33,7 +34,7 @@ class ListenAndPlayArguments: def listen_and_play( send_rate=16000, recv_rate=44100, - list_play_chunk_size=1024, + list_play_chunk_size=512, host="localhost", send_port=12345, recv_port=12346, @@ -79,9 +80,29 @@ def receive_full_chunk(conn, chunk_size): return data while not stop_event.is_set(): - data = receive_full_chunk(recv_socket, list_play_chunk_size * 2) - if data: - recv_queue.put(data) + # Step 1: Receive the first 4 bytes to get the packet length + length_data = receive_full_chunk(recv_socket, 4) + if not length_data: + continue # Handle disconnection or data not available + + # Step 2: Unpack the length (4 bytes) + packet_length = struct.unpack('!I', length_data)[0] + + # Step 3: Receive the full packet based on the length + serialized_packet = receive_full_chunk(recv_socket, packet_length) + if serialized_packet: + # Step 4: Deserialize the packet using pickle + packet = pickle.loads(serialized_packet) + # Step 5: Extract the packet contents + if 'text' in packet: + pass + # print(packet['text']) + if 'visemes' in packet: + pass + # print(packet['visemes']) + + # Step 6: Put the packet audio data into the queue for sending + recv_queue.put(packet['audio'].tobytes()) try: send_stream = sd.RawInputStream( @@ -123,4 +144,4 @@ def receive_full_chunk(conn, chunk_size): if __name__ == "__main__": parser = HfArgumentParser((ListenAndPlayArguments,)) (listen_and_play_kwargs,) = parser.parse_args_into_dataclasses() - listen_and_play(**vars(listen_and_play_kwargs)) + listen_and_play(**vars(listen_and_play_kwargs)) \ No newline at end of file