Skip to content

Commit

Permalink
Use wave stream as input for whisper transcription (#95)
Browse files Browse the repository at this point in the history
  • Loading branch information
ling0322 authored Aug 27, 2024
1 parent e4ea6fe commit 91aeb8b
Show file tree
Hide file tree
Showing 9 changed files with 408 additions and 180 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/cmake-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ jobs:
- name: Install cutlass
run: cd third_party && bash install_cutlass.sh
- name: Build ffmpeg plugin for Windows
run: gcc -shared -o llmpluginffmpeg.dll
run: g++ -shared -o llmpluginffmpeg.dll
-fno-exceptions
-fno-rtti
-Isrc
-Ithird_party/ffmpeg
-DLIBLLM_EXPORTS
Expand Down
6 changes: 5 additions & 1 deletion go/bin/transcribe.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,11 @@ func transcribeMain(args []string) {
slog.Info(fmt.Sprintf("output file is %s", outputFile))

d0 := time.Now()
transcriber := skill.NewWhisperTranscriber(model, inputFile)
transcriber, err := skill.NewWhisperTranscriber(model, inputFile)
if err != nil {
log.Fatal(err)
}

transcriptions := []skill.TranscriptionResult{}
for transcriber.Transcribe() {
r := transcriber.Result()
Expand Down
46 changes: 25 additions & 21 deletions go/ffmpegplugin/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,10 @@ typedef void *LLM_HMODULE;
typedef HMODULE LLM_HMODULE;
#endif

void *(*p_llm_ffmpeg_plugin_load_library)(const char *library_path) = NULL;
char *(*p_llm_ffmpeg_plugin_get_err)() = NULL;
int32_t (*p_llm_ffmpeg_plugin_read_16khz_mono_pcm_from_media_file)(
const char *filename,
char **output_buffer,
int32_t *output_size) = NULL;
// Function pointers for the entry points exported by the ffmpeg plugin shared
// library. They are requested by name in llm_ffmpeg_plugin_load_symbols and
// reset to NULL in llm_ffmpeg_plugin_destroy_library. All of them start out as
// NULL so that "not loaded" is an explicit, uniform state.
char *(*p_llm_ffmpeg_get_err)() = NULL;
void *(*p_llm_ffmpeg_audio_open)(const char *filename) = NULL;
void (*p_llm_ffmpeg_audio_close)(void *reader) = NULL;
int32_t (*p_llm_ffmpeg_audio_read)(void *reader, char *buf, int32_t buf_size) = NULL;

// load the ffmpeg plugin shared library.
void *llm_ffmpeg_plugin_load_library(const char *libraryPath) {
Expand Down Expand Up @@ -74,16 +72,20 @@ void *llm_ffmpeg_plugin_load_library(const char *libraryPath) {
int llm_ffmpeg_plugin_load_symbols(void *pDll) {
LLM_HMODULE hDll = (LLM_HMODULE)pDll;

LOAD_SYMBOL(hDll, llm_ffmpeg_plugin_get_err);
LOAD_SYMBOL(hDll, llm_ffmpeg_plugin_read_16khz_mono_pcm_from_media_file);
LOAD_SYMBOL(hDll, llm_ffmpeg_get_err);
LOAD_SYMBOL(hDll, llm_ffmpeg_audio_open);
LOAD_SYMBOL(hDll, llm_ffmpeg_audio_close);
LOAD_SYMBOL(hDll, llm_ffmpeg_audio_read);

return 0;
}

// release the ffmpeg plugin shared library and reset the resolved function pointers.
void llm_ffmpeg_plugin_destroy_librray(void *handle) {
p_llm_ffmpeg_plugin_get_err = NULL;
p_llm_ffmpeg_plugin_read_16khz_mono_pcm_from_media_file = NULL;
void llm_ffmpeg_plugin_destroy_library(void *handle) {
p_llm_ffmpeg_get_err = NULL;
p_llm_ffmpeg_audio_open = NULL;
p_llm_ffmpeg_audio_close = NULL;
p_llm_ffmpeg_audio_read = NULL;

// first try to load the dll from same folder as current module.
#if defined(LUT_PLATFORM_APPLE) || defined(LUT_PLATFORM_LINUX)
Expand All @@ -100,16 +102,18 @@ void llm_ffmpeg_plugin_destroy_librray(void *handle) {
#endif
}

char *llm_ffmpeg_plugin_get_err() {
return p_llm_ffmpeg_plugin_get_err();
// Open `filename` for audio reading by forwarding to the plugin entry point
// held in p_llm_ffmpeg_audio_open, and return whatever it returns.
// NOTE(review): p_llm_ffmpeg_audio_open is not checked for NULL here; the
// caller is presumably expected to load the symbols first — confirm.
void *llm_ffmpeg_audio_open(const char *filename) {
  return p_llm_ffmpeg_audio_open(filename);
}

int32_t llm_ffmpeg_plugin_read_16khz_mono_pcm_from_media_file(
const char *filename,
char **output_buffer,
int32_t *output_size) {
return p_llm_ffmpeg_plugin_read_16khz_mono_pcm_from_media_file(
filename,
output_buffer,
output_size);
// Close `reader` (a handle obtained from llm_ffmpeg_audio_open) by forwarding
// to the plugin entry point held in p_llm_ffmpeg_audio_close.
// NOTE(review): p_llm_ffmpeg_audio_close is not checked for NULL here; the
// caller is presumably expected to load the symbols first — confirm.
void llm_ffmpeg_audio_close(void *reader) {
  // Plain call, no `return <expr>;`: ISO C forbids a return statement with an
  // expression in a function whose return type is void.
  p_llm_ffmpeg_audio_close(reader);
}

// Read from `reader` into `buf` (capacity `buf_size`) by forwarding to the
// plugin entry point held in p_llm_ffmpeg_audio_read; the plugin's return
// value is passed through unchanged.
// NOTE(review): the Go caller treats 0 as end of stream and a negative value
// as an error — confirm against the plugin implementation.
int32_t llm_ffmpeg_audio_read(void *reader, char *buf, int32_t buf_size) {
  return p_llm_ffmpeg_audio_read(reader, buf, buf_size);
}

// Return the plugin's error string by forwarding to the entry point held in
// p_llm_ffmpeg_get_err. The result is exposed as `const char *` so callers do
// not modify it.
const char *llm_ffmpeg_get_err() {
  return p_llm_ffmpeg_get_err();
}
14 changes: 8 additions & 6 deletions go/ffmpegplugin/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@

#include <stdint.h>

typedef struct llm_ffmpeg_audio_reader_t llm_ffmpeg_audio_reader_t;

void *llm_ffmpeg_plugin_load_library(const char *library_path);
int llm_ffmpeg_plugin_load_symbols(void *handle);
void llm_ffmpeg_plugin_destroy_librray(void *handle);
char *llm_ffmpeg_plugin_get_err();
int32_t llm_ffmpeg_plugin_read_16khz_mono_pcm_from_media_file(
const char *filename,
char **output_buffer,
int32_t *output_size);
void llm_ffmpeg_plugin_destroy_library(void *handle);

const char *llm_ffmpeg_get_err();
void *llm_ffmpeg_audio_open(const char *filename);
void llm_ffmpeg_audio_close(void *reader);
int32_t llm_ffmpeg_audio_read(void *reader, char *buf, int32_t buf_size);
59 changes: 49 additions & 10 deletions go/ffmpegplugin/read_audio.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ package ffmpegplugin
import "C"
import (
"errors"
"io"
"log/slog"
"os"
"path/filepath"
"runtime"
Expand All @@ -39,6 +41,10 @@ var gDll unsafe.Pointer

var Init = sync.OnceValue[error](initIntrnal)

// Reader reads audio data from a media file through the ffmpeg plugin.
type Reader struct {
	// handle is the plugin-side reader returned by llm_ffmpeg_audio_open.
	// It is set to nil once the Reader has been closed.
	handle unsafe.Pointer
}

func initIntrnal() error {
// load the shared library.
binPath, err := os.Executable()
Expand Down Expand Up @@ -74,23 +80,56 @@ func initIntrnal() error {
return nil
}

func Read16KHzMonoPcmFromMediaFile(filename string) (pcmdata []byte, err error) {
func NewReader(filename string) (*Reader, error) {
Init()

if !gInit.Load() {
return nil, errors.New("ffmpeg plugin not initialized")
}

cName := C.CString(filename)
defer C.free(unsafe.Pointer(cName))

var outputPtr *C.char
outputLen := C.int(0)
ret := C.llm_ffmpeg_plugin_read_16khz_mono_pcm_from_media_file(cName, &outputPtr, &outputLen)
if ret < 0 {
err = errors.New(C.GoString(C.llm_ffmpeg_plugin_get_err()))
return
handle := C.llm_ffmpeg_audio_open(cName)
if handle == nil {
return nil, errors.New(C.GoString(C.llm_ffmpeg_get_err()))
}

reader := &Reader{
unsafe.Pointer(handle),
}
runtime.SetFinalizer(reader, func(r *Reader) {
if r.handle != nil {
slog.Warn("ffmpegplugin.Reader is not closed")
r.Close()
}
})
return reader, nil
}

// Read implements io.Reader on top of llm_ffmpeg_audio_read.
//
// It returns the number of bytes written into b. When the plugin reports 0
// bytes it returns io.EOF; when the plugin reports a negative value it
// returns an error built from llm_ffmpeg_get_err. Reading from a closed
// Reader is an error. A zero-length b yields (0, nil) without calling into
// the plugin.
func (r *Reader) Read(b []byte) (n int, err error) {
	if r.handle == nil {
		return 0, errors.New("llm_ffmpeg_audio_reader_t handle is empty")
	}

	// &b[0] below would panic on an empty slice, and io.Reader permits
	// zero-length reads.
	if len(b) == 0 {
		return 0, nil
	}

	// The plugin takes the buffer size as int32; clamp so a very large slice
	// cannot wrap around to a negative size.
	const maxRead = 1<<31 - 1
	if len(b) > maxRead {
		b = b[:maxRead]
	}

	buf := (*C.char)(unsafe.Pointer(&b[0]))
	bufsize := C.int32_t(len(b))

	nb := C.llm_ffmpeg_audio_read(r.handle, buf, bufsize)
	switch {
	case nb > 0:
		return int(nb), nil
	case nb == 0:
		return 0, io.EOF
	default:
		return 0, errors.New(C.GoString(C.llm_ffmpeg_get_err()))
	}
}

// Close releases the plugin-side reader, if it is still open, and clears the
// handle. Calling Close on an already closed Reader does nothing. The
// returned error is always nil.
func (r *Reader) Close() error {
	if r.handle == nil {
		return nil
	}

	C.llm_ffmpeg_audio_close(r.handle)
	r.handle = nil
	return nil
}
107 changes: 106 additions & 1 deletion go/skill/audio.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,118 @@ import (
"bytes"
"errors"
"fmt"
"io"
"log/slog"
"os"
"os/exec"
"path/filepath"
"runtime"
"sync"
"time"

"github.com/ling0322/libllm/go/ffmpegplugin"
)

var gFfmpegBin string
var gFfmpegPluginReady bool

// BlockSize is the size, in bytes, of the buffer WaveStream passes to the
// underlying reader on each read.
// NOTE(review): 60 * 16000 * 2 looks like 60 seconds of 16 kHz 16-bit mono
// PCM — confirm.
var BlockSize = 60 * 16000 * 2

// WaveStream is a forward-only view over the audio data of a media file. It
// buffers bytes from an ffmpegplugin.Reader and addresses them by time offset.
type WaveStream struct {
	// reader is the source the stream pulls audio bytes from.
	reader *ffmpegplugin.Reader
	// buffer holds the bytes already read from reader that lie at or after
	// bufferOffset.
	buffer []byte
	// bufferOffset is the stream position that buffer[0] corresponds to.
	bufferOffset time.Duration
}

// WaveChunk is a span of audio data returned by WaveStream.ReadChunk.
type WaveChunk struct {
	// begin is the stream offset at which the chunk starts.
	begin time.Duration
	// end is begin plus the requested chunk length. It is not shortened when
	// the stream ends early.
	end time.Duration
	// eof reports that the reader hit end of stream while this chunk was
	// being filled.
	eof bool
	// data is the chunk's bytes. It is a slice of the stream's internal
	// buffer, not a copy.
	data []byte
}

// NewWaveStream opens filename with the ffmpeg plugin and returns a WaveStream
// positioned at offset zero with an empty buffer. Any error from
// ffmpegplugin.NewReader is returned unchanged.
func NewWaveStream(filename string) (*WaveStream, error) {
	r, err := ffmpegplugin.NewReader(filename)
	if err != nil {
		return nil, err
	}

	stream := new(WaveStream)
	stream.reader = r
	return stream, nil
}

// durationToBytes converts a duration of audio into a byte count, at
// SampleRate samples per second and two bytes per sample. The duration is
// truncated to a whole number of samples.
func durationToBytes(dur time.Duration) int {
	const bytesPerSample = 2

	nsPerSample := int64(1000000000 / SampleRate)
	return int(dur.Nanoseconds()/nsPerSample) * bytesPerSample
}

// ensureOffset reads from the underlying reader until the buffer covers the
// stream up to offset.
//
// It returns an error when offset lies before the current buffer start (the
// stream cannot go backward), or the reader's error (including io.EOF) when
// the stream ends or fails before offset is reached. Bytes read before an
// error are kept in the buffer.
func (s *WaveStream) ensureOffset(offset time.Duration) error {
	if offset < s.bufferOffset {
		return errors.New("wave stream could not seek backward")
	}

	// The target size does not change while reading, so compute it once.
	need := durationToBytes(offset - s.bufferOffset)
	if len(s.buffer) >= need {
		return nil
	}

	// One scratch block is reused for every read; append copies the bytes
	// out, so reuse is safe and avoids a BlockSize allocation per iteration.
	block := make([]byte, BlockSize)
	for len(s.buffer) < need {
		n, err := s.reader.Read(block)
		// Per the io.Reader contract, consume the n bytes before looking at
		// err.
		s.buffer = append(s.buffer, block[:n]...)
		if err != nil {
			return err
		}
	}

	return nil
}

// Seek moves the stream forward to offset, discarding the buffered bytes
// before it. If the data up to offset cannot be buffered (backward seek, end
// of stream, read error), the error is returned and the stream position is
// left unchanged.
func (s *WaveStream) Seek(offset time.Duration) error {
	if err := s.ensureOffset(offset); err != nil {
		return err
	}

	skip := durationToBytes(offset - s.bufferOffset)
	s.buffer, s.bufferOffset = s.buffer[skip:], offset

	return nil
}

// Offset returns the current stream position, i.e. the offset that the start
// of the internal buffer corresponds to.
func (s *WaveStream) Offset() time.Duration {
	return s.bufferOffset
}

// ReadChunk returns up to length of audio starting at the current offset,
// without advancing the stream.
//
// If the reader reaches end of stream while the chunk is being filled, the
// bytes that are available are returned with eof set. If nothing is buffered
// at all, it returns io.EOF. Any other read error is returned as-is. The
// chunk's data aliases the stream's internal buffer.
func (s *WaveStream) ReadChunk(length time.Duration) (WaveChunk, error) {
	atEOF := false
	switch err := s.ensureOffset(s.bufferOffset + length); {
	case errors.Is(err, io.EOF):
		if len(s.buffer) == 0 {
			return WaveChunk{}, io.EOF
		}
		atEOF = true
	case err != nil:
		return WaveChunk{}, err
	}

	size := min(len(s.buffer), durationToBytes(length))
	chunk := WaveChunk{
		begin: s.bufferOffset,
		end:   s.bufferOffset + length,
		eof:   atEOF,
		data:  s.buffer[:size],
	}
	return chunk, nil
}

// Close closes the underlying ffmpegplugin.Reader and returns its result.
func (s *WaveStream) Close() error {
	return s.reader.Close()
}

var initAudioReader = sync.OnceFunc(func() {
err := ffmpegplugin.Init()
if err != nil {
Expand All @@ -52,7 +151,13 @@ var initAudioReader = sync.OnceFunc(func() {

// read the audio of the input file through the ffmpeg plugin and return it as PCM bytes.
func convertToPcmPlugin(inputFile string) ([]byte, error) {
return ffmpegplugin.Read16KHzMonoPcmFromMediaFile(inputFile)
reader, err := ffmpegplugin.NewReader(inputFile)
if err != nil {
return nil, err
}
defer reader.Close()

return io.ReadAll(reader)
}

// find the path of ffmpeg.
Expand Down
Loading

0 comments on commit 91aeb8b

Please sign in to comment.