furushchev · zacharykratochvil · Apr 28, 2022 · Apr 28, 2022 · Jul 20, 2022 · Aug 27, 2022
diff --git a/launch/respeaker.launch b/launch/respeaker.launch
@@ -8,7 +8,7 @@
   <node name="speech_to_text" pkg="respeaker_ros" type="speech_to_text.py">
     <remap from="audio" to="speech_audio"/>
     <rosparam>
-      language: ja-JP
+      language: en-US
       self_cancellation: true
       tts_tolerance: 0.5
     </rosparam>

diff --git a/package.xml b/package.xml
@@ -11,6 +11,8 @@
   <depend condition="$ROS_PYTHON_VERSION == 3">catkin_virtualenv</depend>
   <depend>angles</depend>
   <depend>audio_common_msgs</depend>
+  <depend>audio_play</depend>
+  <depend>sound_play</depend>
   <depend>dynamic_reconfigure</depend>
   <depend>geometry_msgs</depend>
   <depend>portaudio19-dev</depend>

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,5 @@
 PyAudio==0.2.8
 SpeechRecognition==3.8.1
 click==6.7
-numpy==1.16.2
 pixel-ring==0.1.0
 pyusb==1.0.2
diff --git a/scripts/respeaker_node.py b/scripts/respeaker_node.py
@@ -7,6 +7,7 @@
 import usb.core
 import usb.util
 import pyaudio
+import wave
 import math
 import numpy as np
 import tf.transformations as T
@@ -15,20 +16,21 @@
 import struct
 import sys
 import time
+import speech_recognition as SR
 from audio_common_msgs.msg import AudioData
 from geometry_msgs.msg import PoseStamped
 from std_msgs.msg import Bool, Int32, ColorRGBA
 from dynamic_reconfigure.server import Server
 try:
     from pixel_ring import usb_pixel_ring_v2
 except IOError as e:
-    print e
+    print(e)
     raise RuntimeError("Check the device is connected and recognized")
 
 try:
     from respeaker_ros.cfg import RespeakerConfig
 except Exception as e:
-    print e
+    print(e)
     raise RuntimeError("Need to run respeaker_gencfg.py first")
 
 
@@ -169,12 +171,14 @@ def read(self, name):
             usb.util.CTRL_IN | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE,
             0, cmd, id, length, self.TIMEOUT)
 
-        response = struct.unpack(b'ii', response.tostring())
+        #response = struct.unpack(b'ii', struct.pack(b'ii',*response))
+        response = struct.unpack(b'ii', bytes([x for x in response]))
 
         if data[2] == 'int':
             result = response[0]
         else:
             result = response[0] * (2.**response[1])
+        #result = response[0]
 
         return result
 
@@ -221,16 +225,17 @@ def __init__(self, on_audio, channels=None, suppress_error=True):
         self.available_channels = None
         self.channels = channels
         self.device_index = None
-        self.rate = 16000
-        self.bitwidth = 2
+        self.rate = rospy.get_param("~sample_rate", 16000)
+        self.bitwidth = rospy.get_param("~sample_width", 2)
         self.bitdepth = 16
+        self.i = 0
 
         # find device
         count = self.pyaudio.get_device_count()
         rospy.logdebug("%d audio devices found" % count)
         for i in range(count):
             info = self.pyaudio.get_device_info_by_index(i)
-            name = info["name"].encode("utf-8")
+            name = str(info["name"].encode("utf-8"))
             chan = info["maxInputChannels"]
             rospy.logdebug(" - %d: %s" % (i, name))
             if name.lower().find("respeaker") >= 0:
@@ -266,6 +271,7 @@ def __init__(self, on_audio, channels=None, suppress_error=True):
             input_device_index=self.device_index,
         )
 
+
     def __del__(self):
         self.stop()
         try:
@@ -279,15 +285,19 @@ def __del__(self):
         except:
             pass
 
+
     def stream_callback(self, in_data, frame_count, time_info, status):
+
         # split channel
-        data = np.fromstring(in_data, dtype=np.int16)
-        chunk_per_channel = len(data) / self.available_channels
+        data = np.frombuffer(in_data, dtype=np.int16)
+        chunk_per_channel = np.math.ceil(len(data) / self.available_channels)
         data = np.reshape(data, (chunk_per_channel, self.available_channels))
         for chan in self.channels:
-            chan_data = data[:, chan]
+            chan_data = bytearray(data[:, chan].tobytes())
+
             # invoke callback
-            self.on_audio(chan_data.tostring(), chan)
+            self.on_audio(chan_data, chan)
+
         return None, pyaudio.paContinue
 
     def start(self):
@@ -315,7 +325,7 @@ def __init__(self):
         #
         self.respeaker = RespeakerInterface()
         self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
-        self.speech_audio_buffer = str()
+        self.speech_audio_buffer = bytearray()
         self.is_speeching = False
         self.speech_stopped = rospy.Time(0)
         self.prev_is_voice = None
@@ -333,12 +343,13 @@ def __init__(self):
         # start
         self.speech_prefetch_bytes = int(
             self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
-        self.speech_prefetch_buffer = str()
+        self.speech_prefetch_buffer = bytearray()
         self.respeaker_audio.start()
         self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
                                       self.on_timer)
         self.timer_led = None
         self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)
+        self.big_data0 = []
 
     def on_shutdown(self):
         try:
@@ -377,15 +388,16 @@ def on_status_led(self, msg):
                                        oneshot=True)
 
     def on_audio(self, data, channel):
+
         self.pub_audios[channel].publish(AudioData(data=data))
         if channel == self.main_channel:
             self.pub_audio.publish(AudioData(data=data))
             if self.is_speeching:
                 if len(self.speech_audio_buffer) == 0:
                     self.speech_audio_buffer = self.speech_prefetch_buffer
-                self.speech_audio_buffer += data
+                for x in data: self.speech_audio_buffer += bytearray([x])
             else:
-                self.speech_prefetch_buffer += data
+                for x in data: self.speech_prefetch_buffer += bytearray([x])
                 self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:]
 
     def on_timer(self, event):
@@ -403,7 +415,7 @@ def on_timer(self, event):
 
         # doa
         if doa != self.prev_doa:
-            self.pub_doa_raw.publish(Int32(data=doa))
+            self.pub_doa_raw.publish(data=int(doa))
             self.prev_doa = doa
 
             msg = PoseStamped()
@@ -425,14 +437,14 @@ def on_timer(self, event):
             self.is_speeching = True
         elif self.is_speeching:
             buf = self.speech_audio_buffer
-            self.speech_audio_buffer = str()
+            self.speech_audio_buffer = bytearray()
             self.is_speeching = False
-            duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
+            duration = len(buf) * self.respeaker_audio.bitwidth * 8.0 
             duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
             rospy.loginfo("Speech detected for %.3f seconds" % duration)
             if self.speech_min_duration <= duration < self.speech_max_duration:
 
-                self.pub_speech_audio.publish(AudioData(data=buf))
+                self.pub_speech_audio.publish(AudioData(data=list(buf)))
 
 
 if __name__ == '__main__':

diff --git a/scripts/speech_to_text.py b/scripts/speech_to_text.py
@@ -16,9 +16,9 @@ class SpeechToText(object):
     def __init__(self):
         # format of input audio data
         self.sample_rate = rospy.get_param("~sample_rate", 16000)
-        self.sample_width = rospy.get_param("~sample_width", 2L)
+        self.sample_width = rospy.get_param("~sample_width", 2)
         # language of STT service
-        self.language = rospy.get_param("~language", "ja-JP")
+        self.language = rospy.get_param("~language", "en-US")
         # ignore voice input while the robot is speaking
         self.self_cancellation = rospy.get_param("~self_cancellation", True)
         # time to assume as SPEAKING after tts service is finished
@@ -41,7 +41,7 @@ def __init__(self):
 
         self.pub_speech = rospy.Publisher(
             "speech_to_text", SpeechRecognitionCandidates, queue_size=1)
-        self.sub_audio = rospy.Subscriber("audio", AudioData, self.audio_cb)
+        self.sub_audio = rospy.Subscriber("speech_audio", AudioData, self.audio_cb)
 
     def tts_timer_cb(self, event):
         stamp = event.current_real
@@ -60,23 +60,29 @@ def tts_timer_cb(self, event):
                 self.last_tts = stamp
             if stamp - self.last_tts > self.tts_tolerance:
                 rospy.logdebug("END CANCELLATION")
-                self.is_canceling = False
+                self.is_canceling = Falser
 
     def audio_cb(self, msg):
         if self.is_canceling:
             rospy.loginfo("Speech is cancelled")
             return
-        data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
+        data = SR.AudioData(bytes(msg.data), self.sample_rate, self.sample_width)
+
         try:
             rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
-            result = self.recognizer.recognize_google(
-                data, language=self.language)
-            msg = SpeechRecognitionCandidates(transcript=[result])
+            result = self.recognizer.recognize_sphinx(
+                data, language=self.language, show_all=False) #, key=None
+            rospy.loginfo(result)
+
+            msg = SpeechRecognitionCandidates(transcript=result)
             self.pub_speech.publish(msg)
+
         except SR.UnknownValueError as e:
             rospy.logerr("Failed to recognize: %s" % str(e))
+            rospy.loginfo("value error")
         except SR.RequestError as e:
             rospy.logerr("Failed to recognize: %s" % str(e))
+            rospy.loginfo("request error")
 
 
 if __name__ == '__main__':