Skip to content

Commit

Permalink
Text to speech API for Object Pascal. (#1273)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Aug 20, 2024
1 parent e34a1a2 commit 5a2aa11
Show file tree
Hide file tree
Showing 14 changed files with 905 additions and 22 deletions.
26 changes: 21 additions & 5 deletions .github/workflows/pascal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,29 @@ jobs:
cp -v install/lib/*.dll ../pascal-api-examples/vad
cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
fi
- name: Run Pascal test (TTS)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd tts
./run-piper.sh
rm -rf vits-piper-*
ls -lh
echo "---"
popd
- name: Run Pascal test (VAD + non-streaming ASR)
shell: bash
run: |
Expand Down
1 change: 1 addition & 0 deletions pascal-api-examples/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
link*.res
2 changes: 2 additions & 0 deletions pascal-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|[vad](./vad)| It shows how to use the voice activity detection API.|
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
|[portaudio-test](./portaudio-test)| It shows how to use PortAudio for recording and playing.|
|[tts](./tts)| It shows how to use the text-to-speech API.|
4 changes: 4 additions & 0 deletions pascal-api-examples/tts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
!run-*.sh
piper
piper-playback
link*.res
9 changes: 9 additions & 0 deletions pascal-api-examples/tts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Introduction

This directory contains examples for how to use the TTS (text to speech) APIs.

|Directory| Description|
|---------|------------|
|[run-piper.sh](./run-piper.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech.|
|[run-piper-playback.sh](./run-piper-playback.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech. It plays the generated audio as it is still generating. |

238 changes: 238 additions & 0 deletions pascal-api-examples/tts/piper-playback.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
{ Copyright (c) 2024 Xiaomi Corporation }
program piper;
{
This file shows how to use the text to speech API of sherpa-onnx
with Piper models.
It generates speech from text and saves it to a wave file.
Note that it plays the audio back as it is still generating.
}

{$mode objfpc}

uses
{$ifdef unix}
cthreads,
{$endif}
SysUtils,
dos,
ctypes,
portaudio,
sherpa_onnx;

var
CriticalSection: TRTLCriticalSection;

Tts: TSherpaOnnxOfflineTts;
Audio: TSherpaOnnxGeneratedAudio;
Resampler: TSherpaOnnxLinearResampler;

Text: AnsiString;
Speed: Single = 1.0; {Use a larger value to speak faster}
SpeakerId: Integer = 0;
Buffer: TSherpaOnnxCircularBuffer;
FinishedGeneration: Boolean = False;
FinishedPlaying: Boolean = False;

Version: String;
EnvStr: String;
Status: Integer;
NumDevices: Integer;
DeviceIndex: Integer;
DeviceInfo: PPaDeviceInfo;

{ If you get EDivByZero: Division by zero error, please change the sample rate
to the one supported by your microphone.
}
DeviceSampleRate: Integer = 48000;
I: Integer;
Param: TPaStreamParameters;
Stream: PPaStream;
Wave: TSherpaOnnxWave;

function GenerateCallback(
Samples: pcfloat; N: cint32;
Arg: Pointer): cint; cdecl;
begin
EnterCriticalSection(CriticalSection);
try
if Resampler <> nil then
Buffer.Push(Resampler.Resample(Samples, N, False))
else
Buffer.Push(Samples, N);
finally
LeaveCriticalSection(CriticalSection);
end;

{ 1 means to continue generating; 0 means to stop generating. }
Result := 1;
end;

function PlayCallback(
input: Pointer; output: Pointer;
frameCount: culong;
timeInfo: PPaStreamCallbackTimeInfo;
statusFlags: TPaStreamCallbackFlags;
userData: Pointer ): cint; cdecl;
var
Samples: TSherpaOnnxSamplesArray;
I: Integer;
begin
EnterCriticalSection(CriticalSection);
try
if Buffer.Size >= frameCount then
begin
Samples := Buffer.Get(Buffer.Head, FrameCount);
Buffer.Pop(FrameCount);
end
else if Buffer.Size > 0 then
begin
Samples := Buffer.Get(Buffer.Head, Buffer.Size);
Buffer.Pop(Buffer.Size);
SetLength(Samples, frameCount);
end
else
SetLength(Samples, frameCount);

for I := 0 to frameCount - 1 do
pcfloat(output)[I] := Samples[I];

if (Buffer.Size > 0) or (not FinishedGeneration) then
Result := paContinue
else
begin
Result := paComplete;
FinishedPlaying := True;
end;
finally
LeaveCriticalSection(CriticalSection);
end;
end;

function GetOfflineTts: TSherpaOnnxOfflineTts;
var
Config: TSherpaOnnxOfflineTtsConfig;
begin
Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';
Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
Config.Model.NumThreads := 1;
Config.Model.Debug := False;
Config.MaxNumSentences := 1;

Result := TSherpaOnnxOfflineTts.Create(Config);
end;

begin
Tts := GetOfflineTts;
if Tts.GetSampleRate <> DeviceSampleRate then
Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

Version := String(Pa_GetVersionText);
WriteLn('Version is ', Version);
Status := Pa_Initialize;
if Status <> paNoError then
begin
WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
Exit;
end;

NumDevices := Pa_GetDeviceCount;
WriteLn('Num devices: ', NumDevices);

DeviceIndex := Pa_GetDefaultOutputDevice;

if DeviceIndex = paNoDevice then
begin
WriteLn('No default output device found');
Pa_Terminate;
Exit;
end;

EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
if EnvStr <> '' then
begin
DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
end;

for I := 0 to (NumDevices - 1) do
begin
DeviceInfo := Pa_GetDeviceInfo(I);
if I = DeviceIndex then
{ WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
else
WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
end;

WriteLn('Use device ', DeviceIndex);
WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);

Initialize(Param);
Param.Device := DeviceIndex;
Param.ChannelCount := 1;
Param.SampleFormat := paFloat32;
param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
param.HostApiSpecificStreamInfo := nil;

Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);


{ Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
PPaStreamCallback(@PlayCallback), nil);

if Status <> paNoError then
begin
WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
Pa_Terminate;
Exit;
end;

InitCriticalSection(CriticalSection);

Status := Pa_StartStream(stream);
if Status <> paNoError then
begin
WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
Pa_Terminate;
Exit;
end;

WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

Audio := Tts.Generate(Text, SpeakerId, Speed,
PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
FinishedGeneration := True;
SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
WriteLn('Saved to ./libritts_r-generated.wav');

while not FinishedPlaying do
Pa_Sleep(100); {sleep for 0.1 second }
{TODO(fangjun): Use an event to indicate the play is finished}

DoneCriticalSection(CriticalSection);

FreeAndNil(Tts);
FreeAndNil(Resampler);

Status := Pa_CloseStream(stream);
if Status <> paNoError then
begin
WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
Exit;
end;

Status := Pa_Terminate;
if Status <> paNoError then
begin
WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
Exit;
end;
end.

54 changes: 54 additions & 0 deletions pascal-api-examples/tts/piper.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{ Copyright (c) 2024 Xiaomi Corporation }
program piper;
{
This file shows how to use the text to speech API of sherpa-onnx
with Piper models.
It generates speech from text and saves it to a wave file.
If you want to play it while it is generating, please see
./piper-playback.pas
}

{$mode objfpc}

uses
SysUtils,
sherpa_onnx;

function GetOfflineTts: TSherpaOnnxOfflineTts;
var
Config: TSherpaOnnxOfflineTtsConfig;
begin
Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';
Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
Config.Model.NumThreads := 1;
Config.Model.Debug := False;
Config.MaxNumSentences := 1;

Result := TSherpaOnnxOfflineTts.Create(Config);
end;

var
Tts: TSherpaOnnxOfflineTts;
Audio: TSherpaOnnxGeneratedAudio;

Text: AnsiString;
Speed: Single = 1.0; {Use a larger value to speak faster}
SpeakerId: Integer = 0;

begin
Tts := GetOfflineTts;

WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

Audio := Tts.Generate(Text, SpeakerId, Speed);
SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
WriteLn('Saved to ./libritts_r-generated.wav');

FreeAndNil(Tts);
end.

45 changes: 45 additions & 0 deletions pascal-api-examples/tts/run-piper-playback.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
mkdir -p ../../build
pushd ../../build
cmake \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_PYTHON=OFF \
-DSHERPA_ONNX_ENABLE_TESTS=OFF \
-DSHERPA_ONNX_ENABLE_CHECK=OFF \
-DBUILD_SHARED_LIBS=ON \
-DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
..

cmake --build . --target install --config Release
popd
fi

if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
rm vits-piper-en_US-libritts_r-medium.tar.bz2
fi

fpc \
-dSHERPA_ONNX_USE_SHARED_LIBS \
-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
-Fl$SHERPA_ONNX_DIR/build/install/lib \
-Fl/usr/local/Cellar/portaudio/19.7.0/lib \
./piper-playback.pas

# Please see ../portaudio-test/README.md
# for how to install portaudio on macOS

export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH

./piper-playback
Loading

0 comments on commit 5a2aa11

Please sign in to comment.