Skip to content

Commit

Permalink
feat: add tdrzEnable option (#264)
Browse files Browse the repository at this point in the history
* feat: add tdrzEnable option

* fix(ts): docs

* chore: cleanup

* feat: minor refactor

* fix(android): isTdrzEnable

---------

Co-authored-by: Soory Ranganathan <[email protected]>
Co-authored-by: Jhen <[email protected]>
  • Loading branch information
3 people authored Nov 2, 2024
1 parent 5b943a5 commit 7f495e6
Show file tree
Hide file tree
Showing 9 changed files with 96 additions and 52 deletions.
15 changes: 14 additions & 1 deletion android/src/main/java/com/rnwhisper/WhisperContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ public class WhisperContext {
private boolean isCapturing = false;
private boolean isStoppedByAction = false;
private boolean isTranscribing = false;
private boolean isTdrzEnable = false;
private Thread rootFullHandler = null;
private Thread fullHandler = null;

Expand All @@ -73,6 +74,7 @@ private void rewind() {
isCapturing = false;
isStoppedByAction = false;
isTranscribing = false;
isTdrzEnable = false;
rootFullHandler = null;
fullHandler = null;
}
Expand Down Expand Up @@ -113,6 +115,8 @@ public int startRealtimeTranscribe(int jobId, ReadableMap options) {
double realtimeAudioMinSec = options.hasKey("realtimeAudioMinSec") ? options.getDouble("realtimeAudioMinSec") : 0;
final double audioMinSec = realtimeAudioMinSec > 0.5 && realtimeAudioMinSec <= audioSliceSec ? realtimeAudioMinSec : 1;

this.isTdrzEnable = options.hasKey("tdrzEnable") && options.getBoolean("tdrzEnable");

createRealtimeTranscribeJob(jobId, context, options);

sliceNSamples = new ArrayList<Integer>();
Expand Down Expand Up @@ -333,8 +337,9 @@ public WritableMap transcribeInputStream(int jobId, InputStream inputStream, Rea
throw new Exception("Context is already in capturing or transcribing");
}
rewind();

this.jobId = jobId;
this.isTdrzEnable = options.hasKey("tdrzEnable") && options.getBoolean("tdrzEnable");

isTranscribing = true;
float[] audioData = AudioUtils.decodeWaveFile(inputStream);

Expand Down Expand Up @@ -368,8 +373,15 @@ private WritableMap getTextSegments(int start, int count) {

WritableMap data = Arguments.createMap();
WritableArray segments = Arguments.createArray();

for (int i = 0; i < count; i++) {
String text = getTextSegment(context, i);

// If tdrzEnable is enabled and speaker turn is detected
if (this.isTdrzEnable && getTextSegmentSpeakerTurnNext(context, i)) {
text += " [SPEAKER_TURN]";
}

builder.append(text);

WritableMap segment = Arguments.createMap();
Expand Down Expand Up @@ -499,6 +511,7 @@ protected static native int fullWithNewJob(
protected static native String getTextSegment(long context, int index);
protected static native int getTextSegmentT0(long context, int index);
protected static native int getTextSegmentT1(long context, int index);
protected static native boolean getTextSegmentSpeakerTurnNext(long context, int index);

protected static native void createRealtimeTranscribeJob(
int job_id,
Expand Down
10 changes: 10 additions & 0 deletions android/src/main/jni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ struct whisper_full_params createFullParams(JNIEnv *env, jobject options) {
params.translate = readablemap::getBool(env, options, "translate", false);
params.speed_up = readablemap::getBool(env, options, "speedUp", false);
params.token_timestamps = readablemap::getBool(env, options, "tokenTimestamps", false);
params.tdrz_enable = readablemap::getBool(env, options, "tdrzEnable", false);
params.offset_ms = 0;
params.no_context = true;
params.single_segment = false;
Expand Down Expand Up @@ -493,4 +494,13 @@ Java_com_rnwhisper_WhisperContext_freeContext(
whisper_free(context);
}

// JNI entry point backing WhisperContext.getTextSegmentSpeakerTurnNext(long, int).
// Reinterprets the Java-side handle as a whisper_context pointer and forwards
// the segment index to whisper_full_get_segment_speaker_turn_next, returning
// that call's result as a jboolean.
JNIEXPORT jboolean JNICALL
Java_com_rnwhisper_WhisperContext_getTextSegmentSpeakerTurnNext(
    JNIEnv *env,
    jobject thiz,
    jlong context_ptr,
    jint index
) {
    // Neither the JNI environment nor the receiver is needed for this lookup.
    UNUSED(env);
    UNUSED(thiz);

    struct whisper_context *ctx =
        reinterpret_cast<struct whisper_context *>(context_ptr);

    // Spell out the bool -> jboolean conversion instead of relying on the implicit one.
    if (whisper_full_get_segment_speaker_turn_next(ctx, index)) {
        return JNI_TRUE;
    }
    return JNI_FALSE;
}

} // extern "C"
37 changes: 19 additions & 18 deletions docs/API/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ whisper.rn

#### Defined in

[index.ts:76](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L76)
[index.ts:76](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L76)

___

Expand All @@ -80,7 +80,7 @@ ___

#### Defined in

[index.ts:441](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L441)
[index.ts:441](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L441)

___

Expand All @@ -90,7 +90,7 @@ ___

#### Defined in

[index.ts:59](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L59)
[index.ts:59](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L59)

___

Expand All @@ -108,7 +108,7 @@ ___

#### Defined in

[index.ts:52](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L52)
[index.ts:52](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L52)

___

Expand All @@ -127,7 +127,7 @@ ___

#### Defined in

[index.ts:45](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L45)
[index.ts:45](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L45)

___

Expand All @@ -149,6 +149,7 @@ ___
| `offset?` | `number` | Time offset in milliseconds |
| `prompt?` | `string` | Initial Prompt |
| `speedUp?` | `boolean` | Speed up audio by x2 (reduced accuracy) |
| `tdrzEnable?` | `boolean` | Enable tinydiarize (requires a tdrz model) |
| `temperature?` | `number` | Initial decoding temperature |
| `temperatureInc?` | `number` | - |
| `tokenTimestamps?` | `boolean` | Enable token-level timestamps |
Expand All @@ -157,7 +158,7 @@ ___

#### Defined in

[NativeRNWhisper.ts:5](https://github.com/mybigday/whisper.rn/blob/85066fc/src/NativeRNWhisper.ts#L5)
[NativeRNWhisper.ts:5](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/NativeRNWhisper.ts#L5)

___

Expand All @@ -175,7 +176,7 @@ ___

#### Defined in

[index.ts:70](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L70)
[index.ts:70](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L70)

___

Expand All @@ -200,7 +201,7 @@ ___

#### Defined in

[index.ts:138](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L138)
[index.ts:138](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L138)

___

Expand All @@ -218,7 +219,7 @@ ___

#### Defined in

[index.ts:171](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L171)
[index.ts:171](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L171)

___

Expand All @@ -242,7 +243,7 @@ ___

#### Defined in

[index.ts:158](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L158)
[index.ts:158](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L158)

___

Expand All @@ -252,7 +253,7 @@ ___

#### Defined in

[index.ts:84](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L84)
[index.ts:84](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L84)

___

Expand All @@ -270,7 +271,7 @@ ___

#### Defined in

[NativeRNWhisper.ts:37](https://github.com/mybigday/whisper.rn/blob/85066fc/src/NativeRNWhisper.ts#L37)
[NativeRNWhisper.ts:39](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/NativeRNWhisper.ts#L39)

## Variables

Expand All @@ -295,7 +296,7 @@ AudioSession Utility, iOS only.

#### Defined in

[AudioSessionIos.ts:50](https://github.com/mybigday/whisper.rn/blob/85066fc/src/AudioSessionIos.ts#L50)
[AudioSessionIos.ts:50](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/AudioSessionIos.ts#L50)

___

Expand All @@ -307,7 +308,7 @@ Is allow fallback to CPU if load CoreML model failed

#### Defined in

[index.ts:543](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L543)
[index.ts:543](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L543)

___

Expand All @@ -319,7 +320,7 @@ Is use CoreML models on iOS

#### Defined in

[index.ts:540](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L540)
[index.ts:540](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L540)

___

Expand All @@ -331,7 +332,7 @@ Current version of whisper.cpp

#### Defined in

[index.ts:535](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L535)
[index.ts:535](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L535)

## Functions

Expand All @@ -351,7 +352,7 @@ Current version of whisper.cpp

#### Defined in

[index.ts:467](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L467)
[index.ts:467](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L467)

___

Expand All @@ -365,4 +366,4 @@ ___

#### Defined in

[index.ts:530](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L530)
[index.ts:530](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L530)
14 changes: 7 additions & 7 deletions docs/API/classes/WhisperContext.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

#### Defined in

[index.ts:195](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L195)
[index.ts:195](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L195)

## Properties

Expand All @@ -44,7 +44,7 @@

#### Defined in

[index.ts:191](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L191)
[index.ts:191](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L191)

___

Expand All @@ -54,7 +54,7 @@ ___

#### Defined in

[index.ts:189](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L189)
[index.ts:189](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L189)

___

Expand All @@ -64,7 +64,7 @@ ___

#### Defined in

[index.ts:193](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L193)
[index.ts:193](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L193)

## Methods

Expand All @@ -78,7 +78,7 @@ ___

#### Defined in

[index.ts:436](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L436)
[index.ts:436](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L436)

___

Expand Down Expand Up @@ -106,7 +106,7 @@ Transcribe audio file

#### Defined in

[index.ts:206](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L206)
[index.ts:206](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L206)

___

Expand All @@ -128,4 +128,4 @@ Transcribe the microphone audio stream, the microphone user permission is requir

#### Defined in

[index.ts:302](https://github.com/mybigday/whisper.rn/blob/85066fc/src/index.ts#L302)
[index.ts:302](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/index.ts#L302)
12 changes: 6 additions & 6 deletions docs/API/enums/AudioSessionCategoryIos.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ https://developer.apple.com/documentation/avfaudio/avaudiosessioncategory?langua

#### Defined in

[AudioSessionIos.ts:8](https://github.com/mybigday/whisper.rn/blob/85066fc/src/AudioSessionIos.ts#L8)
[AudioSessionIos.ts:8](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/AudioSessionIos.ts#L8)

___

Expand All @@ -35,7 +35,7 @@ ___

#### Defined in

[AudioSessionIos.ts:13](https://github.com/mybigday/whisper.rn/blob/85066fc/src/AudioSessionIos.ts#L13)
[AudioSessionIos.ts:13](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/AudioSessionIos.ts#L13)

___

Expand All @@ -45,7 +45,7 @@ ___

#### Defined in

[AudioSessionIos.ts:12](https://github.com/mybigday/whisper.rn/blob/85066fc/src/AudioSessionIos.ts#L12)
[AudioSessionIos.ts:12](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/AudioSessionIos.ts#L12)

___

Expand All @@ -55,7 +55,7 @@ ___

#### Defined in

[AudioSessionIos.ts:10](https://github.com/mybigday/whisper.rn/blob/85066fc/src/AudioSessionIos.ts#L10)
[AudioSessionIos.ts:10](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/AudioSessionIos.ts#L10)

___

Expand All @@ -65,7 +65,7 @@ ___

#### Defined in

[AudioSessionIos.ts:11](https://github.com/mybigday/whisper.rn/blob/85066fc/src/AudioSessionIos.ts#L11)
[AudioSessionIos.ts:11](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/AudioSessionIos.ts#L11)

___

Expand All @@ -75,4 +75,4 @@ ___

#### Defined in

[AudioSessionIos.ts:9](https://github.com/mybigday/whisper.rn/blob/85066fc/src/AudioSessionIos.ts#L9)
[AudioSessionIos.ts:9](https://github.com/mybigday/whisper.rn/blob/8f61e46/src/AudioSessionIos.ts#L9)
Loading

0 comments on commit 7f495e6

Please sign in to comment.