-
Notifications
You must be signed in to change notification settings - Fork 1
/
remy.py
209 lines (174 loc) · 7.33 KB
/
remy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
from flask import Flask
from flask_socketio import SocketIO
import queue
import threading
import os
from openai import OpenAI
import speech_recognition as sr
from dotenv import load_dotenv
from datetime import datetime
from audio import AudioPlayer
from video import VideoPlayer
from robot import move_robot
from flask_cors import CORS
class Remy():
    """Voice-driven recipe assistant that answers as "Remy".

    The instance wires together three worker threads and a Socket.IO server:

    * ``listen_audio`` records microphone phrases into ``audio_queue``;
    * ``transcribe_audio`` turns each recording into text and, after the
      wake phrase ``"what's up"`` has been heard, forwards the next utterance
      to ``command_queue``;
    * ``command_handler`` sends each queued command to the chat model,
      broadcasts the exchange over Socket.IO and speaks the answer.

    ``None`` is used as the shutdown sentinel on both queues.
    """

    def __init__(self) -> None:
        """Create all collaborators, then start the workers and the server.

        Constructing an instance has side effects: it loads ``.env``, starts
        the three worker threads (``start``) and starts the Socket.IO server
        thread (``run_socketio_server``).
        """
        # Flask and SocketIO initialization
        self.app = Flask(__name__)
        # NOTE: "*" accepts cross-origin requests from ANY origin.
        CORS(self.app, origins="*")
        self.socketio = SocketIO(self.app, cors_allowed_origins="*")

        # Register Socket.IO event handlers
        self.register_socketio_events()

        # Conversation state and inter-thread plumbing
        self.context = []          # alternating "Client: ..." / "Remy: ..." lines
        self.command_queue = queue.Queue()
        self.audio_queue = queue.Queue()
        self.stop_event = threading.Event()      # set by stop() to end listen_audio
        self.question_event = threading.Event()  # set once the wake phrase was heard
        self.audio_player = AudioPlayer()
        self.video_player = VideoPlayer()
        self.handler_thread = None
        self.listener_thread = None
        self.transcribe_thread = None
        self.conversation = []     # dicts with "img", "question" and "answer" keys

        # Load environment variables from .env file
        load_dotenv()

        # Get OpenAI API key from environment variables
        self.api_key = os.getenv('OPENAI_API_KEY')

        # Initialize OpenAI client
        self.client = OpenAI(api_key=self.api_key)

        # Start the command handler, listener, and transcribe threads
        self.start()

        # Start the Flask-SocketIO server
        self.run_socketio_server()

    def register_socketio_events(self) -> None:
        """Attach the ``connect``, ``disconnect`` and ``receive`` handlers."""

        @self.socketio.on('connect')
        def handle_connect():
            print("Client connected")

        @self.socketio.on('disconnect')
        def handle_disconnect():
            print("Client disconnected")

        @self.socketio.on('receive')
        def handle_receive(data):
            # The payload is ignored; the whole conversation so far is sent.
            self.socketio.emit('receive', self.conversation)

    def run_socketio_server(self) -> None:
        """Run the Flask-SocketIO server on 0.0.0.0:5500 in its own thread."""
        # Daemon thread: the server loop never returns on its own, so a
        # non-daemon thread would keep the process alive after stop().
        server_thread = threading.Thread(
            target=self.socketio.run,
            args=(self.app,),
            kwargs={'host': '0.0.0.0', 'port': 5500},
            daemon=True,
        )
        server_thread.start()

    def respondToCommand(self, response: str) -> None:
        """Speak ``response`` and move the robot.

        The text is synthesized to an audio file, played (the player is asked
        to delete the file afterwards) and the clip length is handed to
        ``move_robot``. If no audio path was produced, ``move_robot`` is
        still called, with ``None``.
        """
        duration = None
        print("response:", response)
        audio_path = self.text_to_audio(response)
        if audio_path:
            duration = self.audio_player.get_audio_length(audio_path)
            self.audio_player.play(audio_path, should_delete=True)
        move_robot(duration)

    def sendCommand(self, command: str) -> None:
        """Answer one user command end to end.

        Captures a camera frame, asks the chat model (passing the accumulated
        context), emits the new exchange on the ``receive`` Socket.IO event,
        records it in ``conversation``/``context`` and finally speaks it.
        """
        print("command:", command)
        photo = self.video_player.capture_frame_as_base64()
        response = self._remy_gpt(" ".join(self.context), command)
        new_response = {
            "img": photo,
            "question": command,
            "answer": response
        }
        self.socketio.emit('receive', [new_response])
        self.conversation.append(new_response)
        self.context.append("Client: " + command)
        self.context.append("Remy: " + response)
        self.respondToCommand(response)

    def command_handler(self) -> None:
        """Process queued commands until the ``None`` sentinel arrives."""
        while True:
            command = self.command_queue.get()
            if command is None:
                print("Received stop signal.")
                break
            if not command:
                # Empty transcriptions are skipped.
                continue
            try:
                self.sendCommand(command)
            except Exception as e:
                # One failed request must not kill the handler thread;
                # report it the same way transcribe_audio does and carry on.
                print(f"An error occurred: {str(e)}")

    def listen_audio(self) -> None:
        """Record microphone phrases into ``audio_queue`` until stopped.

        The loop ends when ``stop_event`` is set. ``listen`` is called with a
        short timeout so the event is re-checked regularly even during
        silence.
        """
        recognizer = sr.Recognizer()
        microphone = sr.Microphone()
        with microphone as source:
            print("Adjusting for ambient noise. Please wait...")
            recognizer.adjust_for_ambient_noise(source, duration=1)
            print("Listening for the phrase...")
            while not self.stop_event.is_set():
                try:
                    audio = recognizer.listen(
                        source, timeout=1, phrase_time_limit=5
                    )
                    self.audio_queue.put(audio.get_wav_data())
                except sr.WaitTimeoutError:
                    # Nobody spoke within the timeout; poll stop_event again.
                    continue
                except sr.UnknownValueError:
                    print("Could not understand audio")
                except KeyboardInterrupt:
                    # Only reachable when this method runs in the main thread.
                    print("Stopping...")
                    self.audio_queue.put(None)
                    if self.transcribe_thread is not None:
                        self.transcribe_thread.join()
                    break

    def transcribe_audio(self) -> None:
        """Transcribe queued recordings and route them.

        Each recording is written to ``temp_audio.wav`` and sent to the
        ``whisper-1`` transcription model. If the wake phrase was heard on a
        previous utterance, the lower-cased text is queued as a command. If
        the text contains ``"what's up"``, the next utterance is armed as a
        command and a chime is played. Stops on the ``None`` sentinel.
        """
        temp_path = "temp_audio.wav"
        while True:
            audio_data = self.audio_queue.get()
            if audio_data is None:
                break
            try:
                with open(temp_path, "wb") as f:
                    f.write(audio_data)
                with open(temp_path, "rb") as audio_file:
                    response = self.client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file
                    )
                text = response.text.lower()
                print(text)
                if self.question_event.is_set():
                    self.question_event.clear()
                    self.command_queue.put(text)
                if "what's up" in text:
                    self.question_event.set()
                    self.audio_player.play("./chime.mp3")
            except Exception as e:
                print(f"An error occurred: {str(e)}")
            finally:
                # Clean up even when transcription failed part-way through.
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

    def text_to_audio(self, text, subfolder="generated_audio"):
        """Synthesize ``text`` with the ``tts-1`` model and return the mp3 path.

        The file is written to ``./<subfolder>/`` (created if missing) under a
        name carrying a second-resolution timestamp.
        """
        subfolder_path = "./" + subfolder
        os.makedirs(subfolder_path, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        speech_file_path = f"{subfolder_path}/remy_gpt_output_audio_{timestamp}.mp3"
        response = self.client.audio.speech.create(
            model="tts-1",
            voice="echo",
            input=text
        )
        response.stream_to_file(speech_file_path)
        return speech_file_path

    def start(self) -> None:
        """Create and start the handler, listener and transcriber threads."""
        self.handler_thread = threading.Thread(target=self.command_handler)
        self.listener_thread = threading.Thread(target=self.listen_audio)
        self.transcribe_thread = threading.Thread(target=self.transcribe_audio)
        self.handler_thread.start()
        self.listener_thread.start()
        self.transcribe_thread.start()

    def stop(self) -> None:
        """Signal every worker to finish and wait for them.

        ``stop_event`` ends the listener loop; the ``None`` sentinels end the
        transcriber and the command handler.
        """
        print("Stopping...")
        self.stop_event.set()
        self.command_queue.put(None)
        self.audio_queue.put(None)
        workers = (
            self.listener_thread,
            self.transcribe_thread,
            self.handler_thread,
        )
        for worker in workers:
            # Skip threads that were never created, and never join ourselves.
            if worker is not None and worker is not threading.current_thread():
                worker.join()

    def add(self, command) -> None:
        """Queue ``command`` for the command handler (bypasses the microphone)."""
        self.command_queue.put(command)

    def _remy_gpt(self, context, text):
        """Ask the fine-tuned chat model and return its stripped reply.

        ``context`` is the prior conversation as one string and ``text`` is
        the new question. Returns an empty string when the model reply
        carries no text content.
        """
        response = self.client.chat.completions.create(
            model="ft:gpt-3.5-turbo-1106:personal:remy:A7TF2xZK",
            messages=[
                {"role": "system", "content": "You are Remy the rat from Ratatouille. Guide users through this recipe: Smash 1 cucumber and cut into bite-sized pieces. Mix 1 teaspoon salt, 2 teaspoons sugar, 1 teaspoon sesame oil, 2 teaspoons soy sauce, and 1 tablespoon rice vinegar to make dressing. Toss cucumber with dressing, 3 chopped garlic cloves, and 1 teaspoon chili oil. Garnish with 1 tsp sesame seeds and cilantro. with step by step with concise responses."},
                {"role": "user", "content": "context: " + context + ". This is the new question I am asking: " + text}
            ],
            max_tokens=150
        )
        # message.content may be None; avoid AttributeError on .strip().
        content = response.choices[0].message.content
        return (content or "").strip()
if __name__ == '__main__':
    # Script entry point: build the assistant and shut it down on Ctrl+C.
    remy = None
    try:
        remy = Remy()
        # Remy() returns as soon as its threads are started. Keep the main
        # thread here, waiting with a timeout so the wait stays interruptible,
        # so that Ctrl+C is actually delivered inside this try block.
        while remy.handler_thread.is_alive():
            remy.handler_thread.join(timeout=0.5)
    except KeyboardInterrupt:
        # The interrupt may arrive before construction finished.
        if remy is not None:
            remy.stop()