diff --git a/android/src/main/java/com/rnwhisper/AudioUtils.java b/android/src/main/java/com/rnwhisper/AudioUtils.java index b6c614d..dab2d02 100644 --- a/android/src/main/java/com/rnwhisper/AudioUtils.java +++ b/android/src/main/java/com/rnwhisper/AudioUtils.java @@ -2,8 +2,6 @@ import android.util.Log; -import java.io.IOException; -import java.io.FileReader; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; @@ -11,23 +9,22 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.ShortBuffer; +import java.util.Base64; + +import java.util.Arrays; public class AudioUtils { private static final String NAME = "RNWhisperAudioUtils"; - public static float[] decodeWaveFile(InputStream inputStream) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - byte[] buffer = new byte[1024]; - int bytesRead; - while ((bytesRead = inputStream.read(buffer)) != -1) { - baos.write(buffer, 0, bytesRead); - } - ByteBuffer byteBuffer = ByteBuffer.wrap(baos.toByteArray()); + private static float[] bufferToFloatArray(byte[] buffer, Boolean cutHeader) { + ByteBuffer byteBuffer = ByteBuffer.wrap(buffer); byteBuffer.order(ByteOrder.LITTLE_ENDIAN); - byteBuffer.position(44); ShortBuffer shortBuffer = byteBuffer.asShortBuffer(); short[] shortArray = new short[shortBuffer.limit()]; shortBuffer.get(shortArray); + if (cutHeader) { + shortArray = Arrays.copyOfRange(shortArray, 22, shortArray.length); + } float[] floatArray = new float[shortArray.length]; for (int i = 0; i < shortArray.length; i++) { floatArray[i] = ((float) shortArray[i]) / 32767.0f; @@ -36,4 +33,22 @@ public static float[] decodeWaveFile(InputStream inputStream) throws IOException } return floatArray; } -} \ No newline at end of file + + public static float[] decodeWaveFile(InputStream inputStream) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; + int bytesRead; + while ((bytesRead = 
inputStream.read(buffer)) != -1) { + baos.write(buffer, 0, bytesRead); + } + return bufferToFloatArray(baos.toByteArray(), true); + } + + public static float[] decodeWaveData(String dataBase64) throws IOException { + return bufferToFloatArray(Base64.getDecoder().decode(dataBase64.substring(dataBase64.indexOf(',') + 1)), true); + } + + public static float[] decodePcmData(String dataBase64) { + return bufferToFloatArray(Base64.getDecoder().decode(dataBase64), false); + } +} diff --git a/android/src/main/java/com/rnwhisper/RNWhisper.java b/android/src/main/java/com/rnwhisper/RNWhisper.java index e04a5d9..447fa67 100644 --- a/android/src/main/java/com/rnwhisper/RNWhisper.java +++ b/android/src/main/java/com/rnwhisper/RNWhisper.java @@ -19,6 +19,7 @@ import java.util.Random; import java.io.File; import java.io.FileInputStream; +import java.io.InputStream; import java.io.PushbackInputStream; public class RNWhisper implements LifecycleEventListener { @@ -119,44 +120,16 @@ protected void onPostExecute(Integer id) { tasks.put(task, "initContext"); } - public void transcribeFile(double id, double jobId, String filePath, ReadableMap options, Promise promise) { - final WhisperContext context = contexts.get((int) id); - if (context == null) { - promise.reject("Context not found"); - return; - } - if (context.isCapturing()) { - promise.reject("The context is in realtime transcribe mode"); - return; - } - if (context.isTranscribing()) { - promise.reject("Context is already transcribing"); - return; - } + private AsyncTask transcribe(WhisperContext context, double jobId, final float[] audioData, final ReadableMap options, Promise promise) { AsyncTask task = new AsyncTask() { private Exception exception; @Override protected WritableMap doInBackground(Void... 
voids) { try { - String waveFilePath = filePath; - - if (filePath.startsWith("http://") || filePath.startsWith("https://")) { - waveFilePath = downloader.downloadFile(filePath); - } - - int resId = getResourceIdentifier(waveFilePath); - if (resId > 0) { - return context.transcribeInputStream( - (int) jobId, - reactContext.getResources().openRawResource(resId), - options - ); - } - - return context.transcribeInputStream( + return context.transcribe( (int) jobId, - new FileInputStream(new File(waveFilePath)), + audioData, options ); } catch (Exception e) { @@ -175,7 +148,66 @@ protected void onPostExecute(WritableMap data) { tasks.remove(this); } }.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR); - tasks.put(task, "transcribeFile-" + id); + return task; + } + + public void transcribeFile(double id, double jobId, String filePathOrBase64, ReadableMap options, Promise promise) { + final WhisperContext context = contexts.get((int) id); + if (context == null) { + promise.reject("Context not found"); + return; + } + if (context.isCapturing()) { + promise.reject("The context is in realtime transcribe mode"); + return; + } + if (context.isTranscribing()) { + promise.reject("Context is already transcribing"); + return; + } + + String waveFilePath = filePathOrBase64; + try { + if (filePathOrBase64.startsWith("http://") || filePathOrBase64.startsWith("https://")) { + waveFilePath = downloader.downloadFile(filePathOrBase64); + } + + float[] audioData; + int resId = getResourceIdentifier(waveFilePath); + if (resId > 0) { + audioData = AudioUtils.decodeWaveFile(reactContext.getResources().openRawResource(resId)); + } else if (filePathOrBase64.startsWith("data:audio/wav;base64,")) { + audioData = AudioUtils.decodeWaveData(filePathOrBase64); + } else { + audioData = AudioUtils.decodeWaveFile(new FileInputStream(new File(waveFilePath))); + } + + AsyncTask task = transcribe(context, jobId, audioData, options, promise); + tasks.put(task, "transcribeFile-" + id); + } catch (Exception 
e) { + promise.reject(e); + } + } + + public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) { + final WhisperContext context = contexts.get((int) id); + if (context == null) { + promise.reject("Context not found"); + return; + } + if (context.isCapturing()) { + promise.reject("The context is in realtime transcribe mode"); + return; + } + if (context.isTranscribing()) { + promise.reject("Context is already transcribing"); + return; + } + + float[] audioData = AudioUtils.decodePcmData(dataBase64); + AsyncTask task = transcribe(context, jobId, audioData, options, promise); + + tasks.put(task, "transcribeData-" + id); } public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) { @@ -211,7 +243,7 @@ protected Void doInBackground(Void... voids) { context.stopTranscribe((int) jobId); AsyncTask completionTask = null; for (AsyncTask task : tasks.keySet()) { - if (tasks.get(task).equals("transcribeFile-" + id)) { + if (tasks.get(task).equals("transcribeFile-" + id) || tasks.get(task).equals("transcribeData-" + id)) { task.get(); break; } @@ -259,7 +291,7 @@ protected Void doInBackground(Void... 
voids) { context.stopCurrentTranscribe(); AsyncTask completionTask = null; for (AsyncTask task : tasks.keySet()) { - if (tasks.get(task).equals("transcribeFile-" + contextId)) { + if (tasks.get(task).equals("transcribeFile-" + contextId) || tasks.get(task).equals("transcribeData-" + contextId)) { task.get(); break; } diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java index 0b5b2be..f3508af 100644 --- a/android/src/main/java/com/rnwhisper/WhisperContext.java +++ b/android/src/main/java/com/rnwhisper/WhisperContext.java @@ -332,7 +332,7 @@ void onNewSegments(int nNew) { } } - public WritableMap transcribeInputStream(int jobId, InputStream inputStream, ReadableMap options) throws IOException, Exception { + public WritableMap transcribe(int jobId, float[] audioData, ReadableMap options) throws IOException, Exception { if (isCapturing || isTranscribing) { throw new Exception("Context is already in capturing or transcribing"); } @@ -341,7 +341,6 @@ public WritableMap transcribeInputStream(int jobId, InputStream inputStream, Rea this.isTdrzEnable = options.hasKey("tdrzEnable") && options.getBoolean("tdrzEnable"); isTranscribing = true; - float[] audioData = AudioUtils.decodeWaveFile(inputStream); boolean hasProgressCallback = options.hasKey("onProgress") && options.getBoolean("onProgress"); boolean hasNewSegmentsCallback = options.hasKey("onNewSegments") && options.getBoolean("onNewSegments"); diff --git a/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java b/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java index bdf6972..a901d9f 100644 --- a/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +++ b/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java @@ -47,6 +47,11 @@ public void transcribeFile(double id, double jobId, String filePath, ReadableMap rnwhisper.transcribeFile(id, jobId, filePath, options, promise); } + @ReactMethod + public void 
transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) { + rnwhisper.transcribeData(id, jobId, dataBase64, options, promise); + } + @ReactMethod public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) { rnwhisper.startRealtimeTranscribe(id, jobId, options, promise); diff --git a/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java b/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java index e0f37c7..aba2463 100644 --- a/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +++ b/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java @@ -47,6 +47,11 @@ public void transcribeFile(double id, double jobId, String filePath, ReadableMap rnwhisper.transcribeFile(id, jobId, filePath, options, promise); } + @ReactMethod + public void transcribeData(double id, double jobId, String dataBase64, ReadableMap options, Promise promise) { + rnwhisper.transcribeData(id, jobId, dataBase64, options, promise); + } + @ReactMethod public void startRealtimeTranscribe(double id, double jobId, ReadableMap options, Promise promise) { rnwhisper.startRealtimeTranscribe(id, jobId, options, promise); diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 0006226..f6fcf00 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -1230,6 +1230,8 @@ PODS: - React-logger (= 0.74.6) - React-perflogger (= 0.74.6) - React-utils (= 0.74.6) + - RNAudioPcmStream (1.1.4): + - React - RNCClipboard (1.14.3): - DoubleConversion - glog @@ -1404,6 +1406,7 @@ DEPENDENCIES: - React-runtimescheduler (from `../node_modules/react-native/ReactCommon/react/renderer/runtimescheduler`) - React-utils (from `../node_modules/react-native/ReactCommon/react/utils`) - ReactCommon/turbomodule/core (from `../node_modules/react-native/ReactCommon`) + - "RNAudioPcmStream (from `../node_modules/@fugood/react-native-audio-pcm-stream`)" - "RNCClipboard (from 
`../node_modules/@react-native-clipboard/clipboard`)" - RNFS (from `../node_modules/react-native-fs`) - RNGestureHandler (from `../node_modules/react-native-gesture-handler`) @@ -1528,6 +1531,8 @@ EXTERNAL SOURCES: :path: "../node_modules/react-native/ReactCommon/react/utils" ReactCommon: :path: "../node_modules/react-native/ReactCommon" + RNAudioPcmStream: + :path: "../node_modules/@fugood/react-native-audio-pcm-stream" RNCClipboard: :path: "../node_modules/@react-native-clipboard/clipboard" RNFS: @@ -1547,10 +1552,10 @@ EXTERNAL SOURCES: SPEC CHECKSUMS: boost: d3f49c53809116a5d38da093a8aa78bf551aed09 - DoubleConversion: 76ab83afb40bddeeee456813d9c04f67f78771b5 + DoubleConversion: fea03f2699887d960129cc54bba7e52542b6f953 FBLazyVector: 4b1589d37c9ff4dba11a63083fe7515fad3ac111 fmt: 4c2741a687cc09f0634a2e2c72a838b99f1ff120 - glog: fdfdfe5479092de0c4bdbebedd9056951f092c4f + glog: c5d68082e772fa1c511173d6b30a9de2c05a69a2 hermes-engine: 2102c92e54a031a270fd1fe84169ec8a0901b7bd RCT-Folly: 02617c592a293bd6d418e0a88ff4ee1f88329b47 RCTDeprecation: 5f1d7e1f8ef6c53f0207e3ac0d0ca23575e8a6ab @@ -1600,6 +1605,7 @@ SPEC CHECKSUMS: React-runtimescheduler: b63ebebd3e000e0ba4ac19ca69bdac071559ad57 React-utils: 2955bdc1b2ed495f14dc7d3bfbbb7e3624cfc0fc ReactCommon: 5c504a77030c7ab89eee75b1725b80d8cee7f5d7 + RNAudioPcmStream: d7491fdfe6bddcebd6ab325df8327014be16743f RNCClipboard: 99d86f515e6262a8a1d0915f1f6e6b410698aa3a RNFS: 4ac0f0ea233904cb798630b3c077808c06931688 RNGestureHandler: e723a54dfedabf2a6be36bbcb6c7d5c96de8a379 @@ -1609,7 +1615,7 @@ SPEC CHECKSUMS: SocketRocket: abac6f5de4d4d62d24e11868d7a2f427e0ef940d SSZipArchive: c69881e8ac5521f0e622291387add5f60f30f3c4 whisper-rn: bbc8e83316260f5b4882a3c3f836b574649cc9e3 - Yoga: 2655d94606b547901976080f15cdc6408575f699 + Yoga: 4f4f07a17818e76d1b04edc01b68b6d49a682100 PODFILE CHECKSUM: b3b0da61591aeebfbbfe372a869068f21e963e16 diff --git a/example/package.json b/example/package.json index 5bf148a..18c4d35 100644 --- 
a/example/package.json +++ b/example/package.json @@ -3,12 +3,14 @@ "version": "0.0.1", "private": true, "scripts": { + "postinstall": "patch-package", "android": "react-native run-android", "ios": "react-native run-ios", "start": "react-native start", "pods": "pod-install --quiet" }, "dependencies": { + "@fugood/react-native-audio-pcm-stream": "^1.1.4", "@react-native-clipboard/clipboard": "^1.14.3", "@react-native/babel-preset": "0.74.88", "@react-native/metro-config": "^0.74.79", @@ -26,6 +28,7 @@ "devDependencies": { "@babel/core": "^7.23.9", "@babel/preset-env": "^7.24.4", - "@babel/runtime": "^7.23.9" + "@babel/runtime": "^7.23.9", + "patch-package": "^8.0.0" } } diff --git a/example/patches/@fugood+react-native-audio-pcm-stream+1.1.4.patch b/example/patches/@fugood+react-native-audio-pcm-stream+1.1.4.patch new file mode 100644 index 0000000..e7ecf48 --- /dev/null +++ b/example/patches/@fugood+react-native-audio-pcm-stream+1.1.4.patch @@ -0,0 +1,10 @@ +diff --git a/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts b/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts +index b846438..9e180cd 100644 +--- a/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts ++++ b/node_modules/@fugood/react-native-audio-pcm-stream/index.d.ts +@@ -1,4 +1,4 @@ +-declare module "react-native-live-audio-stream" { ++declare module "@fugood/react-native-audio-pcm-stream" { + export interface IAudioRecord { + init: (options: Options) => void + start: () => void diff --git a/example/src/App.tsx b/example/src/App.tsx index a2bbf55..e2feecc 100644 --- a/example/src/App.tsx +++ b/example/src/App.tsx @@ -8,6 +8,7 @@ import { enableScreens } from 'react-native-screens' import { NavigationContainer } from '@react-navigation/native' import { createNativeStackNavigator } from '@react-navigation/native-stack' import Transcribe from './Transcribe' +import TranscribeData from './TranscribeData' import Bench from './Bench' enableScreens() @@ -37,7 +38,13 @@ function 
HomeScreen({ navigation }: { navigation: any }) { style={styles.button} onPress={() => navigation.navigate('Transcribe')} > - Transcribe Examples + Example: Transcribe File / Realtime + + navigation.navigate('TranscribeData')} + > + Example: Transcribe Data + diff --git a/example/src/Transcribe.tsx b/example/src/Transcribe.tsx index 9591291..6bba15f 100644 --- a/example/src/Transcribe.tsx +++ b/example/src/Transcribe.tsx @@ -14,7 +14,7 @@ import { initWhisper, libVersion, AudioSessionIos } from '../../src' // whisper. import type { WhisperContext } from '../../src' import { Button } from './Button' import contextOpts from './context-opts' -import { createDir, fileDir, modelHost } from './util' +import { createDir, fileDir, modelHost, toTimestamp } from './util' const sampleFile = require('../assets/jfk.wav') @@ -52,32 +52,10 @@ const styles = StyleSheet.create({ logText: { fontSize: 12, color: '#333' }, }) -function toTimestamp(t: number, comma = false) { - let msec = t * 10 - const hr = Math.floor(msec / (1000 * 60 * 60)) - msec -= hr * (1000 * 60 * 60) - const min = Math.floor(msec / (1000 * 60)) - msec -= min * (1000 * 60) - const sec = Math.floor(msec / 1000) - msec -= sec * 1000 - - const separator = comma ? ',' : '.' - const timestamp = `${String(hr).padStart(2, '0')}:${String(min).padStart( - 2, - '0', - )}:${String(sec).padStart(2, '0')}${separator}${String(msec).padStart( - 3, - '0', - )}` - - return timestamp -} - const mode = process.env.NODE_ENV === 'development' ? 
'debug' : 'release' const recordFile = `${fileDir}/realtime.wav` - const filterPath = (path: string) => path.replace(RNFS.DocumentDirectoryPath, '') diff --git a/example/src/TranscribeData.tsx b/example/src/TranscribeData.tsx new file mode 100644 index 0000000..885ea03 --- /dev/null +++ b/example/src/TranscribeData.tsx @@ -0,0 +1,225 @@ +import React, { useCallback, useEffect, useRef, useState } from 'react' +import { StyleSheet, ScrollView, View, Text } from 'react-native' +import LiveAudioStream from '@fugood/react-native-audio-pcm-stream' +import { Buffer } from 'buffer' +import RNFS from 'react-native-fs' +import Sound from 'react-native-sound' +import { initWhisper, libVersion } from '../../src' +import type { WhisperContext } from '../../src' +import { Button } from './Button' +import contextOpts from './context-opts' +import { createDir, fileDir } from './util' + +const styles = StyleSheet.create({ + scrollview: { flexGrow: 1, justifyContent: 'center' }, + container: { + flex: 1, + alignItems: 'center', + justifyContent: 'center', + padding: 4, + }, + buttons: { flexDirection: 'row' }, + button: { margin: 4, backgroundColor: '#333', borderRadius: 4, padding: 8 }, + buttonClear: { backgroundColor: '#888' }, + buttonText: { fontSize: 14, color: 'white', textAlign: 'center' }, + logContainer: { + backgroundColor: 'lightgray', + padding: 8, + width: '95%', + borderRadius: 8, + marginVertical: 8, + }, + logText: { fontSize: 12, color: '#333' }, +}) + +const mode = process.env.NODE_ENV === 'development' ? 
'debug' : 'release' +const recordFile = `${fileDir}/record.wav` + +const audioOptions = { + sampleRate: 16000, + channels: 1, + bitsPerSample: 16, + audioSource: 6, + wavFile: recordFile, + bufferSize: 16 * 1024, +} + +export default function TranscribeData() { + const whisperContextRef = useRef(null) + const whisperContext = whisperContextRef.current + const [logs, setLogs] = useState([`whisper.cpp version: ${libVersion}`]) + const [transcibeResult, setTranscibeResult] = useState(null) + const [isRecording, setIsRecording] = useState(false) + const recordedDataRef = useRef(null) + + const log = useCallback((...messages: any[]) => { + setLogs((prev) => [...prev, messages.join(' ')]) + }, []) + + useEffect( + () => () => { + whisperContextRef.current?.release() + whisperContextRef.current = null + }, + [], + ) + + const startRecording = async () => { + try { + await createDir(log) + recordedDataRef.current = null + + LiveAudioStream.init(audioOptions) + LiveAudioStream.on('data', (data: string) => { + if (!recordedDataRef.current) { + recordedDataRef.current = Buffer.from(data, 'base64') + } else { + recordedDataRef.current = Buffer.concat([ + recordedDataRef.current, + Buffer.from(data, 'base64'), + ]) + } + }) + + LiveAudioStream.start() + setIsRecording(true) + log('Started recording...') + } catch (error) { + log('Error starting recording:', error) + } + } + + const stopRecording = async () => { + try { + // Stop recording and get the wav file path + await LiveAudioStream.stop() + setIsRecording(false) + log('Stopped recording') + + if (!recordedDataRef.current) return log('No recorded data') + if (!whisperContext) return log('No context') + + // Read the wav file as base64 + const base64Data = recordedDataRef.current!.toString('base64') + log('Start transcribing...') + + const startTime = Date.now() + const { promise } = await whisperContext.transcribeData(base64Data, { + language: 'en', + onProgress: (progress) => { + log(`Transcribing progress: ${progress}%`) 
+ }, + }) + const { result } = await promise + const endTime = Date.now() + + setTranscibeResult( + `Transcribed result: ${result}\n` + + `Transcribed in ${endTime - startTime}ms in ${mode} mode`, + ) + log('Finished transcribing') + } catch (error) { + log('Error stopping recording:', error) + } + } + + return ( + + + +