Add C++ runtime for MeloTTS (#1138)

k2-fsa · Jul 16, 2024 · 960eb75 · 960eb75
1 parent 9548541
commit 960eb75
Show file tree

Hide file tree

Showing 51 changed files with 693 additions and 156 deletions.
diff --git a/.github/workflows/export-melo-tts-to-onnx.yaml b/.github/workflows/export-melo-tts-to-onnx.yaml
@@ -63,10 +63,16 @@ jobs:
             echo "pwd: $PWD"
             ls -lh ../scripts/melo-tts
 
+            rm -rf ./*
+
             cp -v ../scripts/melo-tts/*.onnx .
             cp -v ../scripts/melo-tts/lexicon.txt .
             cp -v ../scripts/melo-tts/tokens.txt .
+            cp -v ../scripts/melo-tts/README.md .
+
+            curl -SL -O https://raw.githubusercontent.com/myshell-ai/MeloTTS/main/LICENSE
 
+            curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/new_heteronym.fst
             curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
             curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
             curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
@@ -77,6 +83,10 @@ jobs:
             git lfs track "*.onnx"
             git add .
 
+            ls -lh
+
+            git status
+
             git commit -m "add models"
             git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/vits-melo-tts-zh_en main || true
 

diff --git a/.github/workflows/windows-x64-jni.yaml b/.github/workflows/windows-x64-jni.yaml
@@ -39,10 +39,14 @@ jobs:
           cd build
           cmake \
             -A x64 \
-            -D CMAKE_BUILD_TYPE=Release \
-            -D BUILD_SHARED_LIBS=ON \
+            -DBUILD_SHARED_LIBS=ON \
             -D SHERPA_ONNX_ENABLE_JNI=ON \
-            -D CMAKE_INSTALL_PREFIX=./install \
+            -DCMAKE_INSTALL_PREFIX=./install \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
+            -DBUILD_ESPEAK_NG_EXE=OFF \
+            -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF  \
+            -DSHERPA_ONNX_ENABLE_BINARY=ON \
             ..
 
       - name: Build sherpa-onnx for windows

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 1.10.16
+
+* Support zh-en TTS model from MeloTTS.
+
 ## 1.10.15
 
 * Downgrade onnxruntime from v1.18.1 to v1.17.1

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,7 +11,7 @@ project(sherpa-onnx)
 # ./nodejs-addon-examples
 # ./dart-api-examples/
 # ./CHANGELOG.md
-set(SHERPA_ONNX_VERSION "1.10.15")
+set(SHERPA_ONNX_VERSION "1.10.16")
 
 # Disable warning about
 #

diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml
@@ -10,7 +10,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.15
+  sherpa_onnx: ^1.10.16
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml
@@ -11,7 +11,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.15
+  sherpa_onnx: ^1.10.16
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml
@@ -8,7 +8,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.15
+  sherpa_onnx: ^1.10.16
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-  sherpa_onnx: ^1.10.15
+  sherpa_onnx: ^1.10.16
   path: ^1.9.0
   args: ^2.5.0
 

diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none'
 
-version: 1.10.14
+version: 1.10.16
 
 topics:
   - speech-recognition
@@ -30,7 +30,7 @@ dependencies:
   record: ^5.1.0
   url_launcher: ^6.2.6
 
-  sherpa_onnx: ^1.10.15
+  sherpa_onnx: ^1.10.16
   # sherpa_onnx:
     # path: ../../flutter/sherpa_onnx
 

diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none' # Remove this line if you wish to publish to pub.dev
 
-version: 1.0.0
+version: 1.10.16
 
 environment:
   sdk: '>=3.4.0 <4.0.0'
@@ -17,7 +17,7 @@ dependencies:
   cupertino_icons: ^1.0.6
   path_provider: ^2.1.3
   path: ^1.9.0
-  sherpa_onnx: ^1.10.15
+  sherpa_onnx: ^1.10.16
   url_launcher: ^6.2.6
   audioplayers: ^5.0.0
 

diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
-version: 1.10.15
+version: 1.10.16
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 
@@ -30,19 +30,19 @@ dependencies:
   flutter:
     sdk: flutter
 
-  sherpa_onnx_android: ^1.10.15
+  sherpa_onnx_android: ^1.10.16
     # path: ../sherpa_onnx_android
 
-  sherpa_onnx_macos: ^1.10.15
+  sherpa_onnx_macos: ^1.10.16
     # path: ../sherpa_onnx_macos
 
-  sherpa_onnx_linux: ^1.10.15
+  sherpa_onnx_linux: ^1.10.16
     # path: ../sherpa_onnx_linux
     #
-  sherpa_onnx_windows: ^1.10.15
+  sherpa_onnx_windows: ^1.10.16
     # path: ../sherpa_onnx_windows
 
-  sherpa_onnx_ios: ^1.10.15
+  sherpa_onnx_ios: ^1.10.16
   # sherpa_onnx_ios:
     # path: ../sherpa_onnx_ios
 

diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
@@ -7,7 +7,7 @@
 # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_ios'
-  s.version          = '1.10.15'
+  s.version          = '1.10.16'
   s.summary          = 'A new Flutter FFI plugin project.'
   s.description      = <<-DESC
 A new Flutter FFI plugin project.

diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_macos'
-  s.version          = '1.10.15'
+  s.version          = '1.10.16'
   s.summary          = 'sherpa-onnx Flutter FFI plugin project.'
   s.description      = <<-DESC
 sherpa-onnx Flutter FFI plugin project.

diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
   "dependencies": {
-    "sherpa-onnx-node": "^1.10.15"
+    "sherpa-onnx-node": "^1.10.16"
   }
 }
diff --git a/scripts/apk/build-apk-tts-engine.sh.in b/scripts/apk/build-apk-tts-engine.sh.in
@@ -78,6 +78,10 @@ sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt
 git diff
 popd
 
+if [[ $model_dir == vits-melo-tts-zh_en ]]; then
+  lang=zh_en
+fi
+
 for arch in arm64-v8a armeabi-v7a x86_64 x86; do
   log "------------------------------------------------------------"
   log "build tts apk for $arch"

diff --git a/scripts/apk/build-apk-tts.sh.in b/scripts/apk/build-apk-tts.sh.in
@@ -76,6 +76,10 @@ sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt
 git diff
 popd
 
+if [[ $model_dir == vits-melo-tts-zh_en ]]; then
+  lang=zh_en
+fi
+
 for arch in arm64-v8a armeabi-v7a x86_64 x86; do
   log "------------------------------------------------------------"
   log "build tts apk for $arch"

diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py
@@ -312,6 +312,11 @@ def get_vits_models() -> List[TtsModel]:
             model_name="vits-zh-hf-fanchen-wnj.onnx",
             lang="zh",
         ),
+        TtsModel(
+            model_dir="vits-melo-tts-zh_en",
+            model_name="model.onnx",
+            lang="zh",
+        ),
         TtsModel(
             model_dir="vits-zh-hf-fanchen-C",
             model_name="vits-zh-hf-fanchen-C.onnx",
@@ -339,18 +344,21 @@ def get_vits_models() -> List[TtsModel]:
         ),
     ]
 
-    rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"]
+    rule_fsts = ["phone.fst", "date.fst", "number.fst"]
     for m in chinese_models:
         s = [f"{m.model_dir}/{r}" for r in rule_fsts]
-        if "vits-zh-hf" in m.model_dir or "sherpa-onnx-vits-zh-ll" == m.model_dir:
+        if (
+            "vits-zh-hf" in m.model_dir
+            or "sherpa-onnx-vits-zh-ll" == m.model_dir
+            or "melo-tts" in m.model_dir
+        ):
             s = s[:-1]
             m.dict_dir = m.model_dir + "/dict"
+        else:
+            m.rule_fars = f"{m.model_dir}/rule.far"
 
         m.rule_fsts = ",".join(s)
 
-        if "vits-zh-hf" not in m.model_dir and "zh-ll" not in m.model_dir:
-            m.rule_fars = f"{m.model_dir}/rule.far"
-
     all_models = chinese_models + [
         TtsModel(
             model_dir="vits-cantonese-hf-xiaomaiiwn",

diff --git a/scripts/dart/sherpa-onnx-pubspec.yaml b/scripts/dart/sherpa-onnx-pubspec.yaml
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
-version: 1.10.15
+version: 1.10.16
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 

diff --git a/scripts/flutter/generate-tts.py b/scripts/flutter/generate-tts.py
@@ -6,9 +6,6 @@
 
 import jinja2
 
-# pip install iso639-lang
-from iso639 import Lang
-
 
 def get_args():
     parser = argparse.ArgumentParser()
@@ -37,13 +34,6 @@ class TtsModel:
     data_dir: Optional[str] = None
     dict_dir: Optional[str] = None
     is_char: bool = False
-    lang_iso_639_3: str = ""
-
-
-def convert_lang_to_iso_639_3(models: List[TtsModel]):
-    for m in models:
-        if m.lang_iso_639_3 == "":
-            m.lang_iso_639_3 = Lang(m.lang).pt3
 
 
 def get_coqui_models() -> List[TtsModel]:
@@ -312,6 +302,11 @@ def get_vits_models() -> List[TtsModel]:
             model_name="vits-zh-hf-fanchen-wnj.onnx",
             lang="zh",
         ),
+        TtsModel(
+            model_dir="vits-melo-tts-zh_en",
+            model_name="model.onnx",
+            lang="zh_en",
+        ),
         TtsModel(
             model_dir="vits-zh-hf-fanchen-C",
             model_name="vits-zh-hf-fanchen-C.onnx",
@@ -332,26 +327,33 @@ def get_vits_models() -> List[TtsModel]:
             model_name="vits-zh-hf-fanchen-unity.onnx",
             lang="zh",
         ),
+        TtsModel(
+            model_dir="sherpa-onnx-vits-zh-ll",
+            model_name="model.onnx",
+            lang="zh",
+        ),
     ]
 
-    rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"]
+    rule_fsts = ["phone.fst", "date.fst", "number.fst"]
     for m in chinese_models:
         s = [f"{m.model_dir}/{r}" for r in rule_fsts]
-        if "vits-zh-hf" in m.model_dir:
+        if (
+            "vits-zh-hf" in m.model_dir
+            or "sherpa-onnx-vits-zh-ll" == m.model_dir
+            or "melo-tts" in m.model_dir
+        ):
             s = s[:-1]
             m.dict_dir = m.model_dir + "/dict"
+        else:
+            m.rule_fars = f"{m.model_dir}/rule.far"
 
         m.rule_fsts = ",".join(s)
 
-        if "vits-zh-hf" not in m.model_dir:
-            m.rule_fars = f"{m.model_dir}/rule.far"
-
     all_models = chinese_models + [
         TtsModel(
             model_dir="vits-cantonese-hf-xiaomaiiwn",
             model_name="vits-cantonese-hf-xiaomaiiwn.onnx",
             lang="cantonese",
-            lang_iso_639_3="yue",
             rule_fsts="vits-cantonese-hf-xiaomaiiwn/rule.fst",
         ),
         # English (US)
@@ -374,7 +376,6 @@ def main():
     all_model_list += get_piper_models()
     all_model_list += get_mimic3_models()
     all_model_list += get_coqui_models()
-    convert_lang_to_iso_639_3(all_model_list)
 
     num_models = len(all_model_list)
 

diff --git a/scripts/melo-tts/README.md b/scripts/melo-tts/README.md
@@ -0,0 +1,6 @@
+# Introduction
+
+Models in this directory are converted from
+https://github.com/myshell-ai/MeloTTS
+
+Note there is only a single female speaker in the model.
diff --git a/scripts/melo-tts/export-onnx.py b/scripts/melo-tts/export-onnx.py
@@ -8,7 +8,6 @@
 from melo.text.chinese import pinyin_to_symbol_map
 from melo.text.english import eng_dict, refine_syllables
 from pypinyin import Style, lazy_pinyin, phrases_dict, pinyin_dict
-from melo.text.symbols import language_tone_start_map
 
 for k, v in pinyin_to_symbol_map.items():
     if isinstance(v, list):
@@ -82,6 +81,7 @@ def generate_tokens(symbol_list):
 def generate_lexicon():
     word_dict = pinyin_dict.pinyin_dict
     phrases = phrases_dict.phrases_dict
+    eng_dict["kaldi"] = [["K", "AH0"], ["L", "D", "IH0"]]
     with open("lexicon.txt", "w", encoding="utf-8") as f:
         for word in eng_dict:
             phones, tones = refine_syllables(eng_dict[word])
@@ -237,9 +237,11 @@ def main():
     meta_data = {
         "model_type": "melo-vits",
         "comment": "melo",
+        "version": 2,
         "language": "Chinese + English",
         "add_blank": int(model.hps.data.add_blank),
         "n_speakers": 1,
+        "jieba": 1,
         "sample_rate": model.hps.data.sampling_rate,
         "bert_dim": 1024,
         "ja_bert_dim": 768,

diff --git a/scripts/melo-tts/run.sh b/scripts/melo-tts/run.sh
@@ -12,7 +12,7 @@ function install() {
   cd MeloTTS
   pip install -r ./requirements.txt
 
-  pip install soundfile onnx onnxruntime
+  pip install soundfile onnx==1.15.0 onnxruntime==1.16.3
 
   python3 -m unidic download
   popd