Merge pull request #63 from sensein/alistair/add_streamlit_dependency

Remove TTS and convert voice feature
sensein · Jun 25, 2024 · e3f0ceb
2 parents 25afa6a + 5d1c158
commit e3f0ceb
Show file tree

Hide file tree

Showing 4 changed files with 6 additions and 97 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -25,8 +25,9 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip setuptools
-          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install --no-cache-dir --editable=".[dev,tts]"
+          pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install --no-cache-dir --editable=".[dev]"
+
       - name: Test with pytest
         run: |
           pytest tests
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,9 +27,12 @@ dependencies = [
     "matplotlib>=3.8.3",
     "click",
     "pydra~=0.23",
+    "numpy",
+    "sentencepiece",
     "transformers",
     "accelerate",
     "fhir.resources==7.1.0",
+    "streamlit",
     "datasets[audio]"
 ]
 
@@ -45,9 +48,6 @@ doc = [
     "jupytext",
     "ipympl"
 ]
-tts = [
-    "TTS; python_version < '3.12'"
-]
 dev = [
     "b2aiprep[doc]",
     "pytest",

diff --git a/src/b2aiprep/cli.py b/src/b2aiprep/cli.py
@@ -26,14 +26,6 @@
     verify_speaker_from_files,
 )
 
-try:
-    import TTS
-except ImportError:
-    TTS = None
-else:
-    from .process import VoiceConversion
-
-
 @click.group()
 def main():
     pass
@@ -274,35 +266,6 @@ def verify(file1, file2, model, device):
     score, prediction = verify_speaker_from_files(file1, file2, model=model, device=device)
     print(f"Score: {float(score):.2f} Prediction: {bool(prediction)}")
 
-
-if TTS is not None:
-
-    @main.command()
-    @click.argument("source_file", type=click.Path(exists=True))
-    @click.argument("target_voice_file", type=click.Path(exists=True))
-    @click.argument("output_file", type=click.Path())
-    @click.option(
-        "--model_name",
-        type=str,
-        default="voice_conversion_models/multilingual/vctk/freevc24",
-        show_default=True,
-    )
-    @click.option(
-        "--device", type=str, default=None, show_default=True, help="Device to use for inference."
-    )
-    @click.option("--progress_bar", type=bool, default=True, show_default=True)
-    def convert_voice(
-        source_file, target_voice_file, output_file, model_name, device, progress_bar
-    ):
-        """
-        Converts the voice in the source_file to match the voice in the target_voice_file,
-        and saves the output to output_file.
-        """
-        vc = VoiceConversion(model_name=model_name, progress_bar=progress_bar, device=device)
-        vc.convert_voice(source_file, target_voice_file, output_file)
-        print(f"Conversion complete. Output saved to: {output_file}")
-
-
 @main.command()
 @click.argument("audio_file", type=click.Path(exists=True))
 @click.option("--model_id", type=str, default="openai/whisper-tiny", show_default=True)

diff --git a/src/b2aiprep/process.py b/src/b2aiprep/process.py
@@ -18,11 +18,6 @@
 from speechbrain.inference.speaker import EncoderClassifier
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 
-try:
-    from TTS.api import TTS
-except ImportError:
-    TTS = None
-
 warnings.filterwarnings("ignore")
 
 
@@ -386,56 +381,6 @@ def to_features(
 
     return features if return_features else None, outfile, outfig
 
-
-if TTS is not None:
-
-    class VoiceConversion:
-        def __init__(
-            self,
-            model_name: str = "voice_conversion_models/multilingual/vctk/freevc24",
-            progress_bar: bool = True,
-            device: ty.Optional[str] = None,
-        ) -> None:
-            """
-            Initialize the Voice Conversion model.
-
-            :param model_name: Name of the model to be used for voice conversion.
-            :param use_gpu: Boolean indicating whether to use GPU for model computation.
-
-            TODO: Add support for multiple devices.
-            """
-            use_gpu = False
-            if device is not None and "cuda" in device:
-                # If CUDA is available, set use_gpu to True
-                if torch.cuda.is_available():
-                    use_gpu = True
-                # If CUDA is not available, raise an error
-                else:
-                    raise ValueError("CUDA is not available. Please use CPU.")
-
-            self.tts = TTS(model_name=model_name, progress_bar=progress_bar, gpu=use_gpu)
-
-        def convert_voice(self, source_file: str, target_file: str, output_file: str) -> None:
-            """
-            Converts the voice from the source audio file to match the voice in
-            the target audio file.
-
-            :param source_file: Path to the source audio file.
-            :param target_file: Path to the target audio file.
-            :param output_file: Path where the converted audio file will be saved.
-            """
-            if not os.path.exists(source_file):
-                raise FileNotFoundError(f"The source file {source_file} does not exist.")
-            if not os.path.exists(target_file):
-                raise FileNotFoundError(f"The target file {target_file} does not exist.")
-
-            # Perform voice conversion without modifying the source or target audio data directly.
-            with torch.no_grad():
-                self.tts.voice_conversion_to_file(
-                    source_wav=source_file, target_wav=target_file, file_path=output_file
-                )
-
-
 class SpeechToText:
     """
     A class for converting speech to text using a specified speech-to-text model.