Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROS Noetic, Python 3 Updates #31

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion launch/respeaker.launch
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<node name="speech_to_text" pkg="respeaker_ros" type="speech_to_text.py">
<remap from="audio" to="speech_audio"/>
<rosparam>
language: ja-JP
language: en-US
self_cancellation: true
tts_tolerance: 0.5
</rosparam>
Expand Down
2 changes: 2 additions & 0 deletions package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
<depend condition="$ROS_PYTHON_VERSION == 3">catkin_virtualenv</depend>
<depend>angles</depend>
<depend>audio_common_msgs</depend>
<depend>audio_play</depend>
<depend>sound_play</depend>
<depend>dynamic_reconfigure</depend>
<depend>geometry_msgs</depend>
<depend>portaudio19-dev</depend>
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
PyAudio==0.2.8
SpeechRecognition==3.8.1
click==6.7
numpy==1.16.2
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure? I believe numpy is still required. See scripts/respeaker_node.py line 11

pixel-ring==0.1.0
pyusb==1.0.2
48 changes: 30 additions & 18 deletions scripts/respeaker_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import usb.core
import usb.util
import pyaudio
import wave
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should be removed again as well, right? that was for the wav file you used for debugging?

import math
import numpy as np
import tf.transformations as T
Expand All @@ -15,20 +16,21 @@
import struct
import sys
import time
import speech_recognition as SR
from audio_common_msgs.msg import AudioData
from geometry_msgs.msg import PoseStamped
from std_msgs.msg import Bool, Int32, ColorRGBA
from dynamic_reconfigure.server import Server
try:
from pixel_ring import usb_pixel_ring_v2
except IOError as e:
print e
print(e)
raise RuntimeError("Check the device is connected and recognized")

try:
from respeaker_ros.cfg import RespeakerConfig
except Exception as e:
print e
print(e)
raise RuntimeError("Need to run respeaker_gencfg.py first")


Expand Down Expand Up @@ -169,12 +171,14 @@ def read(self, name):
usb.util.CTRL_IN | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE,
0, cmd, id, length, self.TIMEOUT)

response = struct.unpack(b'ii', response.tostring())
#response = struct.unpack(b'ii', struct.pack(b'ii',*response))
response = struct.unpack(b'ii', bytes([x for x in response]))

if data[2] == 'int':
result = response[0]
else:
result = response[0] * (2.**response[1])
#result = response[0]

return result

Expand Down Expand Up @@ -221,16 +225,17 @@ def __init__(self, on_audio, channels=None, suppress_error=True):
self.available_channels = None
self.channels = channels
self.device_index = None
self.rate = 16000
self.bitwidth = 2
self.rate = rospy.get_param("~sample_rate", 16000)
self.bitwidth = rospy.get_param("~sample_width", 2)
self.bitdepth = 16
self.i = 0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is also a debugging left-over?


# find device
count = self.pyaudio.get_device_count()
rospy.logdebug("%d audio devices found" % count)
for i in range(count):
info = self.pyaudio.get_device_info_by_index(i)
name = info["name"].encode("utf-8")
name = str(info["name"].encode("utf-8"))
chan = info["maxInputChannels"]
rospy.logdebug(" - %d: %s" % (i, name))
if name.lower().find("respeaker") >= 0:
Expand Down Expand Up @@ -266,6 +271,7 @@ def __init__(self, on_audio, channels=None, suppress_error=True):
input_device_index=self.device_index,
)


def __del__(self):
self.stop()
try:
Expand All @@ -279,15 +285,19 @@ def __del__(self):
except:
pass


def stream_callback(self, in_data, frame_count, time_info, status):

# split channel
data = np.fromstring(in_data, dtype=np.int16)
chunk_per_channel = len(data) / self.available_channels
data = np.frombuffer(in_data, dtype=np.int16)
chunk_per_channel = np.math.ceil(len(data) / self.available_channels)
data = np.reshape(data, (chunk_per_channel, self.available_channels))
for chan in self.channels:
chan_data = data[:, chan]
chan_data = bytearray(data[:, chan].tobytes())

# invoke callback
self.on_audio(chan_data.tostring(), chan)
self.on_audio(chan_data, chan)

return None, pyaudio.paContinue

def start(self):
Expand Down Expand Up @@ -315,7 +325,7 @@ def __init__(self):
#
self.respeaker = RespeakerInterface()
self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
self.speech_audio_buffer = str()
self.speech_audio_buffer = bytearray()
self.is_speeching = False
self.speech_stopped = rospy.Time(0)
self.prev_is_voice = None
Expand All @@ -333,12 +343,13 @@ def __init__(self):
# start
self.speech_prefetch_bytes = int(
self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
self.speech_prefetch_buffer = str()
self.speech_prefetch_buffer = bytearray()
self.respeaker_audio.start()
self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
self.on_timer)
self.timer_led = None
self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)
self.big_data0 = []

def on_shutdown(self):
try:
Expand Down Expand Up @@ -377,15 +388,16 @@ def on_status_led(self, msg):
oneshot=True)

def on_audio(self, data, channel):

self.pub_audios[channel].publish(AudioData(data=data))
if channel == self.main_channel:
self.pub_audio.publish(AudioData(data=data))
if self.is_speeching:
if len(self.speech_audio_buffer) == 0:
self.speech_audio_buffer = self.speech_prefetch_buffer
self.speech_audio_buffer += data
for x in data: self.speech_audio_buffer += bytearray([x])
else:
self.speech_prefetch_buffer += data
for x in data: self.speech_prefetch_buffer += bytearray([x])
self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:]

def on_timer(self, event):
Expand All @@ -403,7 +415,7 @@ def on_timer(self, event):

# doa
if doa != self.prev_doa:
self.pub_doa_raw.publish(Int32(data=doa))
self.pub_doa_raw.publish(data=int(doa))
self.prev_doa = doa

msg = PoseStamped()
Expand All @@ -425,14 +437,14 @@ def on_timer(self, event):
self.is_speeching = True
elif self.is_speeching:
buf = self.speech_audio_buffer
self.speech_audio_buffer = str()
self.speech_audio_buffer = bytearray()
self.is_speeching = False
duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
duration = len(buf) * self.respeaker_audio.bitwidth * 8.0
duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
rospy.loginfo("Speech detected for %.3f seconds" % duration)
if self.speech_min_duration <= duration < self.speech_max_duration:

self.pub_speech_audio.publish(AudioData(data=buf))
self.pub_speech_audio.publish(AudioData(data=list(buf)))


if __name__ == '__main__':
Expand Down
22 changes: 14 additions & 8 deletions scripts/speech_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ class SpeechToText(object):
def __init__(self):
# format of input audio data
self.sample_rate = rospy.get_param("~sample_rate", 16000)
self.sample_width = rospy.get_param("~sample_width", 2L)
self.sample_width = rospy.get_param("~sample_width", 2)
# language of STT service
self.language = rospy.get_param("~language", "ja-JP")
self.language = rospy.get_param("~language", "en-US")
# ignore voice input while the robot is speaking
self.self_cancellation = rospy.get_param("~self_cancellation", True)
# time to assume as SPEAKING after tts service is finished
Expand All @@ -41,7 +41,7 @@ def __init__(self):

self.pub_speech = rospy.Publisher(
"speech_to_text", SpeechRecognitionCandidates, queue_size=1)
self.sub_audio = rospy.Subscriber("audio", AudioData, self.audio_cb)
self.sub_audio = rospy.Subscriber("speech_audio", AudioData, self.audio_cb)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

public topic name change, meaning API changed. intended?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, this is important, the package appears to be set up to publish raw audio to "audio" and processed, clean audio to "speech_audio", so the speech to text feature ought to use the cleaned up audio for better recognition


def tts_timer_cb(self, event):
stamp = event.current_real
Expand All @@ -60,23 +60,29 @@ def tts_timer_cb(self, event):
self.last_tts = stamp
if stamp - self.last_tts > self.tts_tolerance:
rospy.logdebug("END CANCELLATION")
self.is_canceling = False
self.is_canceling = Falser

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes


def audio_cb(self, msg):
if self.is_canceling:
rospy.loginfo("Speech is cancelled")
return
data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
data = SR.AudioData(bytes(msg.data), self.sample_rate, self.sample_width)

try:
rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
result = self.recognizer.recognize_google(
data, language=self.language)
msg = SpeechRecognitionCandidates(transcript=[result])
result = self.recognizer.recognize_sphinx(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not an expert on this, but google->sphinx may change user experience somehow

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should be removed now. specific to my use case

data, language=self.language, show_all=False) #, key=None
rospy.loginfo(result)

msg = SpeechRecognitionCandidates(transcript=result)
self.pub_speech.publish(msg)

except SR.UnknownValueError as e:
rospy.logerr("Failed to recognize: %s" % str(e))
rospy.loginfo("value error")
except SR.RequestError as e:
rospy.logerr("Failed to recognize: %s" % str(e))
rospy.loginfo("request error")


if __name__ == '__main__':
Expand Down