forked from LAION-AI/Desktop_BUD-E
-
Notifications
You must be signed in to change notification settings - Fork 0
/
stream_asr.py
110 lines (81 loc) · 3.22 KB
/
stream_asr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from api_configs.configs import *
import asyncio
import requests
# Import Deepgram components
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
Microphone,
)
# Load the ASR configuration and expose the active API's settings as
# module-level constants used by the transcription code below.
asr_config = get_asr_config()
asr_api = asr_config["default_api"]
_active_api_settings = asr_config["apis"][asr_api]
asr_model = _active_api_settings["model"]
asr_sample_rate = _active_api_settings["sample_rate"]
asr_language = _active_api_settings["language"]
class TranscriptCollector:
    """Accumulate streaming transcript fragments and join them on demand."""

    def __init__(self):
        # Begin with an empty fragment buffer.
        self.transcript_parts = []

    def reset(self):
        """Discard every fragment collected so far."""
        self.transcript_parts = []

    def add_part(self, part):
        """Store one transcript fragment."""
        self.transcript_parts.append(part)

    def get_full_transcript(self):
        """Return all stored fragments joined by single spaces."""
        return ' '.join(self.transcript_parts)
# Module-level singleton shared by the ASR event handlers in this file.
transcript_collector = TranscriptCollector()
if asr_api == "deepgram":
    async def get_transcript(callback):
        """Stream microphone audio to Deepgram and invoke *callback* with the
        finalized sentence.

        Args:
            callback: Callable taking one ``str``; invoked once with the full
                sentence after Deepgram marks the utterance as final.

        Returns after one complete sentence has been transcribed, or
        immediately if the connection/transcription fails.
        """
        # Set once a full sentence has been delivered; used to end streaming.
        transcription_complete = asyncio.Event()
        try:
            # keepalive stops Deepgram from closing the socket during pauses
            # in the audio stream.
            config = DeepgramClientOptions(options={"keepalive": "true"})
            # Empty key string -> the SDK resolves the API key from its
            # environment (e.g. DEEPGRAM_API_KEY).
            deepgram: DeepgramClient = DeepgramClient("", config)

            dg_connection = deepgram.listen.asynclive.v("1")
            print("Listening...")

            async def on_message(self, result, **kwargs):
                # Each event carries the best-alternative transcript fragment.
                sentence = result.channel.alternatives[0].transcript
                if not result.speech_final:
                    # Interim fragment: buffer it and keep listening.
                    transcript_collector.add_part(sentence)
                else:
                    # Final fragment of the current utterance.
                    transcript_collector.add_part(sentence)
                    full_sentence = transcript_collector.get_full_transcript()
                    # Ignore empty utterances (e.g. pure silence).
                    if len(full_sentence.strip()) > 0:
                        full_sentence = full_sentence.strip()
                        print(f"Human: {full_sentence}")
                        callback(full_sentence)  # Deliver the finished sentence.
                        transcript_collector.reset()
                        transcription_complete.set()  # Signal to stop transcription and exit.

            # Route transcript events into the handler above.
            dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)

            options = LiveOptions(
                model=asr_model,
                punctuate=True,
                language=asr_language,
                encoding="linear16",
                channels=1,
                sample_rate=asr_sample_rate,
                # Milliseconds of trailing silence before Deepgram finalizes
                # an utterance (speech_final=True).
                endpointing=300,
                smart_format=True,
            )

            # Open the live transcription connection.
            await dg_connection.start(options)

            # Feed audio from the default input device into the connection.
            microphone = Microphone(dg_connection.send)
            microphone.start()

            # Block until on_message signals a completed sentence.
            await transcription_complete.wait()

            # Stop the microphone before tearing down the connection.
            microphone.finish()
            await dg_connection.finish()

        except Exception as e:
            # Bug fix: the previous message claimed "Could not open socket"
            # for every failure (handler errors, bad options, microphone
            # problems included); report the actual error instead.
            print(f"Transcription failed: {e}")
            return