-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathtranscriber.py
89 lines (71 loc) · 2.19 KB
/
transcriber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
Speech-to-text transcription service based on OpenAI Whisper.
From Modal's quillman
"""
import tempfile
import time
from modal import Image, method
from common import stub
# Whisper checkpoint to load; "base.en" is a small English-only model.
MODEL_NAME = "base.en"
def download_model():
    """Pre-fetch the Whisper checkpoint at image build time so containers start warm."""
    import whisper

    whisper.load_model(MODEL_NAME)
# Container image for the transcriber: slim Debian with the ffmpeg CLI,
# the pinned OpenAI Whisper release plus its Python ffmpeg bindings, and
# the model weights baked in at build time via download_model.
_base_image = Image.debian_slim(python_version="3.10.8").apt_install(
    "git", "ffmpeg"
)
transcriber_image = _base_image.pip_install(
    "https://github.com/openai/whisper/archive/v20230314.tar.gz",
    "ffmpeg-python",
).run_function(download_model)
def load_audio(data: bytes, sr: int = 16000):
    """Decode raw audio bytes into a mono float32 numpy array sampled at *sr* Hz.

    The bytes are spilled to a temporary file and piped through the ffmpeg CLI,
    which down-mixes to one channel and resamples to *sr*.

    Args:
        data: raw audio bytes as received from the client.
        sr: target sample rate in Hz (Whisper expects 16000).

    Returns:
        1-D numpy.float32 array of samples.

    Raises:
        RuntimeError: if ffmpeg fails to decode the input.
    """
    import os

    import ffmpeg
    import numpy as np

    # ffmpeg needs a seekable path, so write the bytes to a named temp file.
    fp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        fp.write(data)
        fp.close()
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        # NOTE(review): the *input* is declared as headerless f32le PCM at 48k
        # despite the ".wav" suffix — confirm callers really send raw PCM.
        out, _ = (
            ffmpeg.input(
                fp.name,
                threads=0,
                format="f32le",
                acodec="pcm_f32le",
                ac=1,
                ar="48k",
            )
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(
                cmd=["ffmpeg", "-nostdin"],
                capture_stdout=True,
                capture_stderr=True,
            )
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    finally:
        # Fix: the original used delete=False and never removed the file,
        # leaking one temp file per call.
        fp.close()
        os.unlink(fp.name)
    return np.frombuffer(out, np.float32).flatten()
@stub.cls(
    gpu="A100",
    container_idle_timeout=180,
    image=transcriber_image,
)
class Whisper:
    """Modal class that transcribes audio segments with an OpenAI Whisper model."""

    def __enter__(self):
        """Load the Whisper model once per container, preferring GPU if present."""
        import torch
        import whisper

        self.use_gpu = torch.cuda.is_available()
        self.model = whisper.load_model(
            MODEL_NAME, device="cuda" if self.use_gpu else "cpu"
        )

    @method()
    def transcribe_segment(
        self,
        audio_data: bytes,
    ):
        """Decode *audio_data* and return Whisper's transcription result dict."""
        t0 = time.time()
        samples = load_audio(audio_data)
        # fp16 inference is only enabled when a GPU is available.
        result = self.model.transcribe(samples, language="en", fp16=self.use_gpu)  # type: ignore
        print(f"Transcribed in {time.time() - t0:.2f}s")
        return result