Skip to content

Commit

Permalink
feat: support wav base64 for transcribe & add transcribeData for no head data (#267)
Browse files Browse the repository at this point in the history

* feat(ios): support wav base64 for transcribe & add transcribeData for no head data

* feat(example): wip

* feat(example): wip in TranscribeData example

* feat(example): update button title

* feat(android): support transcribeData & transcribeFile with base64
  • Loading branch information
jhen0409 authored Nov 9, 2024
1 parent 45afdc7 commit 06ecd04
Show file tree
Hide file tree
Showing 20 changed files with 812 additions and 154 deletions.
39 changes: 27 additions & 12 deletions android/src/main/java/com/rnwhisper/AudioUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,29 @@

import android.util.Log;

import java.io.IOException;
import java.io.FileReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.ShortBuffer;
import java.util.Base64;

import java.util.Arrays;

public class AudioUtils {
private static final String NAME = "RNWhisperAudioUtils";

public static float[] decodeWaveFile(InputStream inputStream) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int bytesRead;
while ((bytesRead = inputStream.read(buffer)) != -1) {
baos.write(buffer, 0, bytesRead);
}
ByteBuffer byteBuffer = ByteBuffer.wrap(baos.toByteArray());
private static float[] bufferToFloatArray(byte[] buffer, Boolean cutHeader) {
ByteBuffer byteBuffer = ByteBuffer.wrap(buffer);
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
byteBuffer.position(44);
ShortBuffer shortBuffer = byteBuffer.asShortBuffer();
short[] shortArray = new short[shortBuffer.limit()];
shortBuffer.get(shortArray);
if (cutHeader) {
shortArray = Arrays.copyOfRange(shortArray, 44, shortArray.length);
}
float[] floatArray = new float[shortArray.length];
for (int i = 0; i < shortArray.length; i++) {
floatArray[i] = ((float) shortArray[i]) / 32767.0f;
Expand All @@ -36,4 +33,22 @@ public static float[] decodeWaveFile(InputStream inputStream) throws IOException
}
return floatArray;
}
}

/**
 * Reads the whole stream into memory and converts it to float samples.
 *
 * <p>The bytes are collected in 1024-byte chunks until end of stream and then
 * handed to {@code bufferToFloatArray} with {@code cutHeader = true}.
 * This method does not close {@code inputStream}; that is left to the caller.
 *
 * @param inputStream stream positioned at the start of the WAV content
 * @return the samples produced by {@code bufferToFloatArray}
 * @throws IOException if reading from the stream fails
 */
public static float[] decodeWaveFile(InputStream inputStream) throws IOException {
  final ByteArrayOutputStream collected = new ByteArrayOutputStream();
  final byte[] chunk = new byte[1024];
  for (int count = inputStream.read(chunk); count != -1; count = inputStream.read(chunk)) {
    collected.write(chunk, 0, count);
  }
  final byte[] wavBytes = collected.toByteArray();
  return bufferToFloatArray(wavBytes, true);
}

/**
 * Decodes base64-encoded WAV content into float samples.
 *
 * <p>Accepts either bare base64 or a data URI such as
 * {@code data:audio/wav;base64,AAAA...}. For a data URI, everything up to and
 * including the first comma is dropped before decoding, because the basic
 * Base64 decoder rejects the {@code ':'}, {@code ';'} and {@code ','}
 * characters found in the prefix.
 *
 * <p>The decoded bytes are passed to {@code bufferToFloatArray} with
 * {@code cutHeader = true}.
 * NOTE(review): header removal is delegated to that helper; confirm that it
 * skips the 44-byte WAV header rather than 44 16-bit samples.
 *
 * @param dataBase64 base64 text, optionally prefixed with a {@code data:} URI header
 * @return the samples produced by {@code bufferToFloatArray}
 * @throws IOException declared for signature compatibility with existing callers
 * @throws IllegalArgumentException if the payload is not valid base64
 */
public static float[] decodeWaveData(String dataBase64) throws IOException {
  String payload = dataBase64;
  if (payload.startsWith("data:")) {
    // ',' is not part of the base64 alphabet, so the first comma always ends the URI header.
    // With no comma, indexOf yields -1 and the string is left unchanged (decode then fails as before).
    payload = payload.substring(payload.indexOf(',') + 1);
  }
  return bufferToFloatArray(Base64.getDecoder().decode(payload), true);
}

/**
 * Decodes base64-encoded raw sample data into float samples.
 *
 * <p>The decoded bytes are passed to {@code bufferToFloatArray} with
 * {@code cutHeader = false}, i.e. nothing is stripped from the front.
 *
 * @param dataBase64 base64 text of the raw audio bytes
 * @return the samples produced by {@code bufferToFloatArray}
 * @throws IllegalArgumentException if {@code dataBase64} is not valid base64
 */
public static float[] decodePcmData(String dataBase64) {
  final byte[] rawBytes = Base64.getDecoder().decode(dataBase64);
  final float[] samples = bufferToFloatArray(rawBytes, false);
  return samples;
}
}
100 changes: 66 additions & 34 deletions android/src/main/java/com/rnwhisper/RNWhisper.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.util.Random;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.PushbackInputStream;

public class RNWhisper implements LifecycleEventListener {
Expand Down Expand Up @@ -119,44 +120,16 @@ protected void onPostExecute(Integer id) {
tasks.put(task, "initContext");
}

public void transcribeFile(double id, double jobId, String filePath, ReadableMap options, Promise promise) {
final WhisperContext context = contexts.get((int) id);
if (context == null) {
promise.reject("Context not found");
return;
}
if (context.isCapturing()) {
promise.reject("The context is in realtime transcribe mode");
return;
}
if (context.isTranscribing()) {
promise.reject("Context is already transcribing");
return;
}
private AsyncTask transcribe(WhisperContext context, double jobId, final float[] audioData, final ReadableMap options, Promise promise) {
AsyncTask task = new AsyncTask<Void, Void, WritableMap>() {
private Exception exception;

@Override
protected WritableMap doInBackground(Void... voids) {
try {
String waveFilePath = filePath;

if (filePath.startsWith("http://") || filePath.startsWith("https://")) {
waveFilePath = downloader.downloadFile(filePath);
}

int resId = getResourceIdentifier(waveFilePath);
if (resId > 0) {
return context.transcribeInputStream(
(int) jobId,
reactContext.getResources().openRawResource(resId),
options
);
}

return context.transcribeInputStream(
return context.transcribe(
(int) jobId,
new FileInputStream(new File(waveFilePath)),
audioData,
options
);
} catch (Exception e) {
Expand All @@ -175,7 +148,66 @@ protected void onPostExecute(WritableMap data) {
tasks.remove(this);
}
}.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
tasks.put(task, "transcribeFile-" + id);
return task;
}

/**
 * Decodes a WAV source and starts an asynchronous transcription for it.
 *
 * <p>{@code filePathOrBase64} may be:
 * <ul>
 *   <li>a {@code data:audio/wav;base64,...} data URI, decoded in memory;</li>
 *   <li>an {@code http://} or {@code https://} URL, fetched through
 *       {@code downloader.downloadFile} first;</li>
 *   <li>a name for which {@code getResourceIdentifier} returns a positive id,
 *       opened as a raw resource;</li>
 *   <li>otherwise, a path opened as a local file.</li>
 * </ul>
 *
 * <p>Decoding (and any download) runs on the calling thread before the
 * transcription task is started. Any exception raised while preparing the
 * audio rejects {@code promise}.
 *
 * @param id context id used to look up the {@code WhisperContext}
 * @param jobId job id forwarded to the transcription
 * @param filePathOrBase64 data URI, URL, resource name or file path
 * @param options transcription options forwarded unchanged
 * @param promise settled with the result, or rejected on failure
 */
public void transcribeFile(double id, double jobId, String filePathOrBase64, ReadableMap options, Promise promise) {
  final WhisperContext context = contexts.get((int) id);
  if (context == null) {
    promise.reject("Context not found");
    return;
  }
  if (context.isCapturing()) {
    promise.reject("The context is in realtime transcribe mode");
    return;
  }
  if (context.isTranscribing()) {
    promise.reject("Context is already transcribing");
    return;
  }

  final String dataUriPrefix = "data:audio/wav;base64,";
  try {
    float[] audioData;
    if (filePathOrBase64.startsWith(dataUriPrefix)) {
      // Hand over only the base64 payload: the decoder rejects the ':' ';' ','
      // characters of the URI header. Checked first so a large inline payload
      // is never used as a resource name or file path.
      audioData = AudioUtils.decodeWaveData(filePathOrBase64.substring(dataUriPrefix.length()));
    } else {
      String waveFilePath = filePathOrBase64;
      if (filePathOrBase64.startsWith("http://") || filePathOrBase64.startsWith("https://")) {
        waveFilePath = downloader.downloadFile(filePathOrBase64);
      }

      int resId = getResourceIdentifier(waveFilePath);
      // decodeWaveFile reads the stream to the end, so it is safe to close it
      // right after; previously neither stream was ever closed.
      try (InputStream inputStream = resId > 0
          ? reactContext.getResources().openRawResource(resId)
          : new FileInputStream(new File(waveFilePath))) {
        audioData = AudioUtils.decodeWaveFile(inputStream);
      }
    }

    AsyncTask task = transcribe(context, jobId, audioData, options, promise);
    tasks.put(task, "transcribeFile-" + id);
  } catch (Exception e) {
    promise.reject(e);
  }
}

/**
 * Decodes base64 audio data without a WAV header and starts an asynchronous
 * transcription for it.
 *
 * <p>The data is decoded with {@code AudioUtils.decodePcmData} on the calling
 * thread. If decoding or task start-up throws (for example on malformed
 * base64), {@code promise} is rejected instead of the exception escaping and
 * leaving the promise unsettled.
 *
 * @param id context id used to look up the {@code WhisperContext}
 * @param jobId job id forwarded to the transcription
 * @param dataBase64 base64 text of the audio bytes
 * @param options transcription options forwarded unchanged
 * @param promise settled with the result, or rejected on failure
 */
public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) {
  final WhisperContext context = contexts.get((int) id);
  if (context == null) {
    promise.reject("Context not found");
    return;
  }
  if (context.isCapturing()) {
    promise.reject("The context is in realtime transcribe mode");
    return;
  }
  if (context.isTranscribing()) {
    promise.reject("Context is already transcribing");
    return;
  }

  try {
    float[] audioData = AudioUtils.decodePcmData(dataBase64);
    AsyncTask task = transcribe(context, jobId, audioData, options, promise);

    tasks.put(task, "transcribeData-" + id);
  } catch (Exception e) {
    // Same failure handling as transcribeFile: report through the promise.
    promise.reject(e);
  }
}

public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) {
Expand Down Expand Up @@ -211,7 +243,7 @@ protected Void doInBackground(Void... voids) {
context.stopTranscribe((int) jobId);
AsyncTask completionTask = null;
for (AsyncTask task : tasks.keySet()) {
if (tasks.get(task).equals("transcribeFile-" + id)) {
if (tasks.get(task).equals("transcribeFile-" + id) || tasks.get(task).equals("transcribeData-" + id)) {
task.get();
break;
}
Expand Down Expand Up @@ -259,7 +291,7 @@ protected Void doInBackground(Void... voids) {
context.stopCurrentTranscribe();
AsyncTask completionTask = null;
for (AsyncTask task : tasks.keySet()) {
if (tasks.get(task).equals("transcribeFile-" + contextId)) {
if (tasks.get(task).equals("transcribeFile-" + contextId) || tasks.get(task).equals("transcribeData-" + contextId)) {
task.get();
break;
}
Expand Down
3 changes: 1 addition & 2 deletions android/src/main/java/com/rnwhisper/WhisperContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ void onNewSegments(int nNew) {
}
}

public WritableMap transcribeInputStream(int jobId, InputStream inputStream, ReadableMap options) throws IOException, Exception {
public WritableMap transcribe(int jobId, float[] audioData, ReadableMap options) throws IOException, Exception {
if (isCapturing || isTranscribing) {
throw new Exception("Context is already in capturing or transcribing");
}
Expand All @@ -341,7 +341,6 @@ public WritableMap transcribeInputStream(int jobId, InputStream inputStream, Rea
this.isTdrzEnable = options.hasKey("tdrzEnable") && options.getBoolean("tdrzEnable");

isTranscribing = true;
float[] audioData = AudioUtils.decodeWaveFile(inputStream);

boolean hasProgressCallback = options.hasKey("onProgress") && options.getBoolean("onProgress");
boolean hasNewSegmentsCallback = options.hasKey("onNewSegments") && options.getBoolean("onNewSegments");
Expand Down
5 changes: 5 additions & 0 deletions android/src/newarch/java/com/rnwhisper/RNWhisperModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ public void transcribeFile(double id, double jobId, String filePath, ReadableMap
rnwhisper.transcribeFile(id, jobId, filePath, options, promise);
}

/**
 * React method entry point for transcribing base64 audio data.
 * Forwards every argument unchanged to {@code rnwhisper.transcribeData}.
 */
@ReactMethod
public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) {
rnwhisper.transcribeData(id, jobId, dataBase64, options, promise);
}

@ReactMethod
public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) {
rnwhisper.startRealtimeTranscribe(id, jobId, options, promise);
Expand Down
5 changes: 5 additions & 0 deletions android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ public void transcribeFile(double id, double jobId, String filePath, ReadableMap
rnwhisper.transcribeFile(id, jobId, filePath, options, promise);
}

/**
 * React method entry point for transcribing base64 audio data.
 * Forwards every argument unchanged to {@code rnwhisper.transcribeData}.
 */
@ReactMethod
public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) {
rnwhisper.transcribeData(id, jobId, dataBase64, options, promise);
}

@ReactMethod
public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) {
rnwhisper.startRealtimeTranscribe(id, jobId, options, promise);
Expand Down
12 changes: 9 additions & 3 deletions example/ios/Podfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,8 @@ PODS:
- React-logger (= 0.74.6)
- React-perflogger (= 0.74.6)
- React-utils (= 0.74.6)
- RNAudioPcmStream (1.1.4):
- React
- RNCClipboard (1.14.3):
- DoubleConversion
- glog
Expand Down Expand Up @@ -1404,6 +1406,7 @@ DEPENDENCIES:
- React-runtimescheduler (from `../node_modules/react-native/ReactCommon/react/renderer/runtimescheduler`)
- React-utils (from `../node_modules/react-native/ReactCommon/react/utils`)
- ReactCommon/turbomodule/core (from `../node_modules/react-native/ReactCommon`)
- "RNAudioPcmStream (from `../node_modules/@fugood/react-native-audio-pcm-stream`)"
- "RNCClipboard (from `../node_modules/@react-native-clipboard/clipboard`)"
- RNFS (from `../node_modules/react-native-fs`)
- RNGestureHandler (from `../node_modules/react-native-gesture-handler`)
Expand Down Expand Up @@ -1528,6 +1531,8 @@ EXTERNAL SOURCES:
:path: "../node_modules/react-native/ReactCommon/react/utils"
ReactCommon:
:path: "../node_modules/react-native/ReactCommon"
RNAudioPcmStream:
:path: "../node_modules/@fugood/react-native-audio-pcm-stream"
RNCClipboard:
:path: "../node_modules/@react-native-clipboard/clipboard"
RNFS:
Expand All @@ -1547,10 +1552,10 @@ EXTERNAL SOURCES:

SPEC CHECKSUMS:
boost: d3f49c53809116a5d38da093a8aa78bf551aed09
DoubleConversion: 76ab83afb40bddeeee456813d9c04f67f78771b5
DoubleConversion: fea03f2699887d960129cc54bba7e52542b6f953
FBLazyVector: 4b1589d37c9ff4dba11a63083fe7515fad3ac111
fmt: 4c2741a687cc09f0634a2e2c72a838b99f1ff120
glog: fdfdfe5479092de0c4bdbebedd9056951f092c4f
glog: c5d68082e772fa1c511173d6b30a9de2c05a69a2
hermes-engine: 2102c92e54a031a270fd1fe84169ec8a0901b7bd
RCT-Folly: 02617c592a293bd6d418e0a88ff4ee1f88329b47
RCTDeprecation: 5f1d7e1f8ef6c53f0207e3ac0d0ca23575e8a6ab
Expand Down Expand Up @@ -1600,6 +1605,7 @@ SPEC CHECKSUMS:
React-runtimescheduler: b63ebebd3e000e0ba4ac19ca69bdac071559ad57
React-utils: 2955bdc1b2ed495f14dc7d3bfbbb7e3624cfc0fc
ReactCommon: 5c504a77030c7ab89eee75b1725b80d8cee7f5d7
RNAudioPcmStream: d7491fdfe6bddcebd6ab325df8327014be16743f
RNCClipboard: 99d86f515e6262a8a1d0915f1f6e6b410698aa3a
RNFS: 4ac0f0ea233904cb798630b3c077808c06931688
RNGestureHandler: e723a54dfedabf2a6be36bbcb6c7d5c96de8a379
Expand All @@ -1609,7 +1615,7 @@ SPEC CHECKSUMS:
SocketRocket: abac6f5de4d4d62d24e11868d7a2f427e0ef940d
SSZipArchive: c69881e8ac5521f0e622291387add5f60f30f3c4
whisper-rn: bbc8e83316260f5b4882a3c3f836b574649cc9e3
Yoga: 2655d94606b547901976080f15cdc6408575f699
Yoga: 4f4f07a17818e76d1b04edc01b68b6d49a682100

PODFILE CHECKSUM: b3b0da61591aeebfbbfe372a869068f21e963e16

Expand Down
5 changes: 4 additions & 1 deletion example/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
"version": "0.0.1",
"private": true,
"scripts": {
"postinstall": "patch-package",
"android": "react-native run-android",
"ios": "react-native run-ios",
"start": "react-native start",
"pods": "pod-install --quiet"
},
"dependencies": {
"@fugood/react-native-audio-pcm-stream": "^1.1.4",
"@react-native-clipboard/clipboard": "^1.14.3",
"@react-native/babel-preset": "0.74.88",
"@react-native/metro-config": "^0.74.79",
Expand All @@ -26,6 +28,7 @@
"devDependencies": {
"@babel/core": "^7.23.9",
"@babel/preset-env": "^7.24.4",
"@babel/runtime": "^7.23.9"
"@babel/runtime": "^7.23.9",
"patch-package": "^8.0.0"
}
}
10 changes: 10 additions & 0 deletions example/patches/@fugood+react-native-audio-pcm-stream+1.1.4.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
diff --git a/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts b/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts
index b846438..9e180cd 100644
--- a/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts
+++ b/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts
@@ -1,4 +1,4 @@
-declare module "react-native-live-audio-stream" {
+declare module "@fugood/react-native-audio-pcm-stream" {
export interface IAudioRecord {
init: (options: Options) => void
start: () => void
10 changes: 9 additions & 1 deletion example/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { enableScreens } from 'react-native-screens'
import { NavigationContainer } from '@react-navigation/native'
import { createNativeStackNavigator } from '@react-navigation/native-stack'
import Transcribe from './Transcribe'
import TranscribeData from './TranscribeData'
import Bench from './Bench'

enableScreens()
Expand Down Expand Up @@ -37,7 +38,13 @@ function HomeScreen({ navigation }: { navigation: any }) {
style={styles.button}
onPress={() => navigation.navigate('Transcribe')}
>
<Text style={styles.buttonText}>Transcribe Examples</Text>
<Text style={styles.buttonText}>Example: Transcribe File / Realtime</Text>
</TouchableOpacity>
<TouchableOpacity
style={styles.button}
onPress={() => navigation.navigate('TranscribeData')}
>
<Text style={styles.buttonText}>Example: Transcribe Data</Text>
</TouchableOpacity>
<TouchableOpacity
style={styles.button}
Expand All @@ -58,6 +65,7 @@ function App() {
<Stack.Navigator>
<Stack.Screen name="Home" component={HomeScreen} />
<Stack.Screen name="Transcribe" component={Transcribe} />
<Stack.Screen name="TranscribeData" component={TranscribeData} />
<Stack.Screen name="Bench" component={Bench} />
</Stack.Navigator>
</NavigationContainer>
Expand Down
Loading

0 comments on commit 06ecd04

Please sign in to comment.