From 1af3483cbbfef69dd6b14db0f61456113fb7daa8 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Mon, 17 Apr 2023 12:02:51 +0900 Subject: [PATCH 01/19] [webrtcvad_ros] add python package --- webrtcvad_ros/CMakeLists.txt | 2 ++ webrtcvad_ros/README.md | 6 +++++- webrtcvad_ros/python/webrtcvad_ros/__init__.py | 0 webrtcvad_ros/setup.py | 9 +++++++++ 4 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 webrtcvad_ros/python/webrtcvad_ros/__init__.py create mode 100644 webrtcvad_ros/setup.py diff --git a/webrtcvad_ros/CMakeLists.txt b/webrtcvad_ros/CMakeLists.txt index 2e30c5afb..ea158642d 100644 --- a/webrtcvad_ros/CMakeLists.txt +++ b/webrtcvad_ros/CMakeLists.txt @@ -3,4 +3,6 @@ project(webrtcvad_ros) find_package(catkin REQUIRED) +catkin_python_setup() + catkin_package() diff --git a/webrtcvad_ros/README.md b/webrtcvad_ros/README.md index 38100cd29..0edb87671 100644 --- a/webrtcvad_ros/README.md +++ b/webrtcvad_ros/README.md @@ -1,6 +1,10 @@ # webrtcvad_ros -This package provides a wrapper node for [webrtcvad](https://github.com/wiseman/py-webrtcvad). It subscribes an audio topic and publish a flag if curretly speeched or not with VAD. +This package provides VAD (Voice Activity Detection) code. It subscribes an audio topic and publish a flag if curretly speeched or not with VAD. +There is 2 types of implementation. + +1. One uses [webrtcvad](https://github.com/wiseman/py-webrtcvad). +2. One uses [silero-vad](https://github.com/snakers4/silero-vad/tree/master/examples/microphone_and_webRTC_integration) ## Prerequities diff --git a/webrtcvad_ros/python/webrtcvad_ros/__init__.py b/webrtcvad_ros/python/webrtcvad_ros/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/webrtcvad_ros/setup.py b/webrtcvad_ros/setup.py new file mode 100644 index 000000000..1e6030df6 --- /dev/null +++ b/webrtcvad_ros/setup.py @@ -0,0 +1,9 @@ +from distutils.core import setup +from catkin_pkg.python_setup import generate_distutils_setup + +d = generate_distutils_setup( + packages=['webrtcvad_ros'], + package_dir={'': 'python'} +) + +setup(**d) From a92e0d2c97c98081a8e5495736004c6d389d5f08 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Mon, 17 Apr 2023 12:19:22 +0900 Subject: [PATCH 02/19] [webrtcvad_ros] make vad_node as a library and rename node --- webrtcvad_ros/launch/sample.launch | 14 ++-- webrtcvad_ros/node_scripts/vad.py | 30 ++++++++ webrtcvad_ros/node_scripts/webrtcvad_ros.py | 59 --------------- .../python/webrtcvad_ros/vad_core.py | 71 +++++++++++++++++++ 4 files changed, 108 insertions(+), 66 deletions(-) create mode 100755 webrtcvad_ros/node_scripts/vad.py delete mode 100755 webrtcvad_ros/node_scripts/webrtcvad_ros.py create mode 100644 webrtcvad_ros/python/webrtcvad_ros/vad_core.py diff --git a/webrtcvad_ros/launch/sample.launch b/webrtcvad_ros/launch/sample.launch index a5c865e59..ed8aeb43c 100644 --- a/webrtcvad_ros/launch/sample.launch +++ b/webrtcvad_ros/launch/sample.launch @@ -1,19 +1,19 @@ - + + > aggressiveness: 1 - - + + + > language: ja-JP @@ -35,5 +35,5 @@ pkg="webrtcvad_ros" type="print_stt_result.py" output="screen" - /> + /> diff --git a/webrtcvad_ros/node_scripts/vad.py b/webrtcvad_ros/node_scripts/vad.py new file mode 100755 index 000000000..a36827667 --- /dev/null +++ b/webrtcvad_ros/node_scripts/vad.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +import rospy +import webrtcvad + +from webrtcvad_ros.vad_core import VADBaseNode + + +class WebRTCVADROS(VADBaseNode): + + def __init__(self): + + super(WebRTCVADROS, self).__init__() + + aggressiveness = rospy.get_param('~aggressiveness', 1) + self._vad = webrtcvad.Vad(int(aggressiveness)) + + def _get_vad_confidence(self, chunk, sampling_rate): + return 1.0 if self._vad.is_speech(chunk, sampling_rate) else 0.0 + + +def main(): + + rospy.init_node('webrtcvad_ros') + node = WebRTCVADROS() + rospy.spin() + + +if __name__ == '__main__': + main() diff --git a/webrtcvad_ros/node_scripts/webrtcvad_ros.py b/webrtcvad_ros/node_scripts/webrtcvad_ros.py deleted file mode 100755 index 38fc91ad8..000000000 --- a/webrtcvad_ros/node_scripts/webrtcvad_ros.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python - -import webrtcvad -import rospy -from audio_common_msgs.msg import AudioData, AudioInfo -from std_msgs.msg import Bool - -class WebRTCVADROS(object): - - def __init__(self): - - self._current_speaking = False - self._speech_audio_buffer = '' - - aggressiveness = rospy.get_param('~aggressiveness',1) - self._minimum_duration = rospy.get_param('~minimum_duration',0.4) - self._vad = webrtcvad.Vad(int(aggressiveness)) - - self._pub_is_speech = rospy.Publisher('~is_speeching',Bool,queue_size=1) - self._pub_speech_audio = rospy.Publisher('~speech_audio',AudioData,queue_size=1) - self._pub_speech_audio_info = rospy.Publisher('~speech_audio_info',AudioInfo,queue_size=1,latch=True) - - self._audio_info = rospy.wait_for_message('audio_info',AudioInfo) - if self._audio_info.sample_format != 'S16LE': - rospy.logerr('audio format must be S16LE') - return - if self._audio_info.sample_rate not in [8000, 16000, 32000, 48000]: - rospy.logerr('sampling rate must be 8000 or 16000 or 32000 or 48000') - return - - self._pub_speech_audio_info.publish(self._audio_info) - self._sub = rospy.Subscriber('audio_data',AudioData,self._callback) - - def _callback(self, msg): - is_speech = self._vad.is_speech(msg.data, self._audio_info.sample_rate) - self._pub_is_speech.publish(Bool(is_speech)) - if self._current_speaking == True and is_speech == True: - self._speech_audio_buffer = self._speech_audio_buffer + msg.data - elif self._current_speaking == False and is_speech == True: - self._speech_audio_buffer = msg.data - self._current_speaking = True - elif self._current_speaking == True and is_speech == False: - self._speech_audio_buffer = self._speech_audio_buffer + msg.data - speech_duration = (len(self._speech_audio_buffer) / 2.0) / self._audio_info.sample_rate - if speech_duration > self._minimum_duration: - self._pub_speech_audio.publish(AudioData(self._speech_audio_buffer)) - else: - rospy.logwarn('speech duration: {} dropped'.format(speech_duration)) - self._current_speaking = False - self._speech_audio_buffer = '' - -def main(): - - rospy.init_node('webrtcvad_ros') - node = WebRTCVADROS() - rospy.spin() - -if __name__=='__main__': - main() diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py new file mode 100644 index 000000000..2d46a73b7 --- /dev/null +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +import rospy +from audio_common_msgs.msg import AudioData, AudioInfo +from std_msgs.msg import Bool + + +class VADBaseNode(object): + + def __init__(self): + + self._current_speaking = False + self._speech_audio_buffer = '' + + self._threshold = rospy.get_param('~threshold', 0.5) + self._minimum_duration = rospy.get_param('~minimum_duration', 0.4) + + self._pub_is_speech = rospy.Publisher('~is_speeching', Bool, queue_size=1) + self._pub_speech_audio = rospy.Publisher('~speech_audio', + AudioData, + queue_size=1) + self._pub_speech_audio_info = rospy.Publisher('~speech_audio_info', + AudioInfo, + queue_size=1, + latch=True) + + self._audio_info = rospy.wait_for_message('audio_info', AudioInfo) + if self._audio_info.sample_format != 'S16LE': + rospy.logerr('audio format must be S16LE') + return + if self._audio_info.sample_rate not in [8000, 16000, 32000, 48000]: + rospy.logerr('sampling rate must be 8000 or 16000 or 32000 or 48000') + return + + self._pub_speech_audio_info.publish(self._audio_info) + self._sub = rospy.Subscriber('audio_data', AudioData, self._callback) + + def _get_vad_confidence(self, chunk, sampling_rate): + raise NotImplementedError + + def _callback(self, msg): + confidence = self._get_vad_confidence(msg.data, + self._audio_info.sample_rate) + is_speech = True if confidence > self._threshold else False + self._pub_is_speech.publish(Bool(is_speech)) + if self._current_speaking == True and is_speech == True: + self._speech_audio_buffer = self._speech_audio_buffer + msg.data + elif self._current_speaking == False and is_speech == True: + self._speech_audio_buffer = msg.data + self._current_speaking = True + elif self._current_speaking == True and is_speech == False: + self._speech_audio_buffer = self._speech_audio_buffer + msg.data + speech_duration = (len(self._speech_audio_buffer) / + 2.0) / self._audio_info.sample_rate + if speech_duration > self._minimum_duration: + self._pub_speech_audio.publish(AudioData(self._speech_audio_buffer)) + else: + rospy.logwarn('speech duration: {} dropped'.format(speech_duration)) + self._current_speaking = False + self._speech_audio_buffer = '' + + +def main(): + + rospy.init_node('webrtcvad_ros') + node = WebRTCVADROS() + rospy.spin() + + +if __name__ == '__main__': + main() From 4e741500b8d5e0eb7fa38591170a5809fd6dfcef Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Mon, 17 Apr 2023 12:20:17 +0900 Subject: [PATCH 03/19] [webrtcvad_ros] update README.md --- webrtcvad_ros/README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/webrtcvad_ros/README.md b/webrtcvad_ros/README.md index 0edb87671..ded7ab582 100644 --- a/webrtcvad_ros/README.md +++ b/webrtcvad_ros/README.md @@ -1,10 +1,7 @@ # webrtcvad_ros This package provides VAD (Voice Activity Detection) code. It subscribes an audio topic and publish a flag if curretly speeched or not with VAD. -There is 2 types of implementation. - -1. One uses [webrtcvad](https://github.com/wiseman/py-webrtcvad). -2. One uses [silero-vad](https://github.com/snakers4/silero-vad/tree/master/examples/microphone_and_webRTC_integration) +This package uses [webrtcvad](https://github.com/wiseman/py-webrtcvad). ## Prerequities From fc2090462e77cf9b9753067cefbfb9be64c18842 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Mon, 17 Apr 2023 12:33:47 +0900 Subject: [PATCH 04/19] [silero_vad_ros] add silero_vad_ros package --- silero_vad_ros/CMakeLists.txt | 7 +++++ silero_vad_ros/node_scripts/vad.py | 42 ++++++++++++++++++++++++++++++ silero_vad_ros/package.xml | 24 +++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 silero_vad_ros/CMakeLists.txt create mode 100644 silero_vad_ros/node_scripts/vad.py create mode 100644 silero_vad_ros/package.xml diff --git a/silero_vad_ros/CMakeLists.txt b/silero_vad_ros/CMakeLists.txt new file mode 100644 index 000000000..138dd11b9 --- /dev/null +++ b/silero_vad_ros/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 3.0.2) +project(silero_vad_ros) + +find_package(catkin REQUIRED) + +catkin_package( +) diff --git a/silero_vad_ros/node_scripts/vad.py b/silero_vad_ros/node_scripts/vad.py new file mode 100644 index 000000000..ac2841477 --- /dev/null +++ b/silero_vad_ros/node_scripts/vad.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +import struct + +import numpy as np +import rospy +import torch + +from webrtcvad_ros.vad_core import VADBaseNode + + +class SileroVADROS(VADBaseNode): + + def __init__(self): + + super(SileroVADROS, self).__init__() + + self.model_vad, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad', + model='silero_vad', + force_reload=True) + + def _get_vad_confidence(self, chunk, sampling_rate): + return self.model_vad( + torch.from_numpy(self._convert_bytearray_to_numpy_array(chunk)), + sampling_rate).item() + + def _convert_bytearray_to_numpy_array(self, data): + if self._audio_info.sample_format == 'S16LE': + return np.array(struct.unpack("{}h".format(len(data) / 2), data)) + else: + return np.array([]) + + +def main(): + + rospy.init_node('_ros') + node = WebRTCVADROS() + rospy.spin() + + +if __name__ == '__main__': + main() diff --git a/silero_vad_ros/package.xml b/silero_vad_ros/package.xml new file mode 100644 index 000000000..733e5efdf --- /dev/null +++ b/silero_vad_ros/package.xml @@ -0,0 +1,24 @@ + + + silero_vad_ros + 2.1.24 + This package provides a wrapper node for silero_vad. It subscribes an audio topic and + publish a flag if curretly speeched or not with VAD. + + Kei Okada + Koki Shinjo + + Koki Shinjo + + BSD + + catkin + + audio_common_msgs + rospy + python-webrtcvad-pip + webrtcvad_ros + + + + From ef0bf964d0aedd2062ec76f5ba0a815a895f084a Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 19:21:16 +0900 Subject: [PATCH 05/19] [silero_vad_ros] update --- silero_vad_ros/CMakeLists.txt | 11 +++++++- silero_vad_ros/launch/sample.launch | 39 +++++++++++++++++++++++++++++ silero_vad_ros/node_scripts/vad.py | 24 ++++++++++-------- silero_vad_ros/package.xml | 3 +++ silero_vad_ros/requirements.txt | 1 + 5 files changed, 67 insertions(+), 11 deletions(-) create mode 100644 silero_vad_ros/launch/sample.launch create mode 100644 silero_vad_ros/requirements.txt diff --git a/silero_vad_ros/CMakeLists.txt b/silero_vad_ros/CMakeLists.txt index 138dd11b9..79549a027 100644 --- a/silero_vad_ros/CMakeLists.txt +++ b/silero_vad_ros/CMakeLists.txt @@ -1,7 +1,16 @@ cmake_minimum_required(VERSION 3.0.2) project(silero_vad_ros) -find_package(catkin REQUIRED) +find_package(catkin REQUIRED COMPONENTS catkin_virtualenv) + +catkin_generate_virtualenv( + PYTHON_INTERPRETER python3.8 + CHECK_VENV FALSE +) catkin_package( ) + +catkin_install_python(PROGRAMS + node_scripts/vad.py + DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}) diff --git a/silero_vad_ros/launch/sample.launch b/silero_vad_ros/launch/sample.launch new file mode 100644 index 000000000..30ff409a0 --- /dev/null +++ b/silero_vad_ros/launch/sample.launch @@ -0,0 +1,39 @@ + + + + + + + + threshold: 0.5 + + + + + + + + + language: ja-JP + self_cancellation: false + tts_tolerance: 1.0 + + + + + diff --git a/silero_vad_ros/node_scripts/vad.py b/silero_vad_ros/node_scripts/vad.py index ac2841477..74221620b 100644 --- a/silero_vad_ros/node_scripts/vad.py +++ b/silero_vad_ros/node_scripts/vad.py @@ -13,28 +13,32 @@ class SileroVADROS(VADBaseNode): def __init__(self): + model_vad, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad', + model='silero_vad', + force_reload=True) + self.model_vad = model_vad + super(SileroVADROS, self).__init__() - self.model_vad, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad', - model='silero_vad', - force_reload=True) + rospy.loginfo('Initialized.') def _get_vad_confidence(self, chunk, sampling_rate): - return self.model_vad( - torch.from_numpy(self._convert_bytearray_to_numpy_array(chunk)), - sampling_rate).item() + print('chunk: {}'.format(chunk)) + nparray = self._convert_bytearray_to_numpy_array(chunk) + print('nparray: {}'.format(nparray)) + return self.model_vad(torch.from_numpy(nparray), sampling_rate).item() def _convert_bytearray_to_numpy_array(self, data): if self._audio_info.sample_format == 'S16LE': - return np.array(struct.unpack("{}h".format(len(data) / 2), data)) + return np.array(struct.unpack("{}h".format(int(len(data) / 2)), data)) else: - return np.array([]) + raise ValueError() def main(): - rospy.init_node('_ros') - node = WebRTCVADROS() + rospy.init_node('silero_vad_ros') + node = SileroVADROS() rospy.spin() diff --git a/silero_vad_ros/package.xml b/silero_vad_ros/package.xml index 733e5efdf..31d64cba2 100644 --- a/silero_vad_ros/package.xml +++ b/silero_vad_ros/package.xml @@ -14,11 +14,14 @@ catkin + catkin_virtualenv + audio_common_msgs rospy python-webrtcvad-pip webrtcvad_ros + requirements.txt diff --git a/silero_vad_ros/requirements.txt b/silero_vad_ros/requirements.txt new file mode 100644 index 000000000..3daffcdd3 --- /dev/null +++ b/silero_vad_ros/requirements.txt @@ -0,0 +1 @@ +torchaudio From ee14dcb7642f8dfbf9fd98b10231b822ae9edc54 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 20:46:16 +0900 Subject: [PATCH 06/19] [webrtcvad_ros] enable change chunk_size --- webrtcvad_ros/node_scripts/vad.py | 4 +-- .../python/webrtcvad_ros/vad_core.py | 27 +++++++++++++------ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/webrtcvad_ros/node_scripts/vad.py b/webrtcvad_ros/node_scripts/vad.py index a36827667..df68bc68d 100755 --- a/webrtcvad_ros/node_scripts/vad.py +++ b/webrtcvad_ros/node_scripts/vad.py @@ -10,11 +10,11 @@ class WebRTCVADROS(VADBaseNode): def __init__(self): - super(WebRTCVADROS, self).__init__() - aggressiveness = rospy.get_param('~aggressiveness', 1) self._vad = webrtcvad.Vad(int(aggressiveness)) + super(WebRTCVADROS, self).__init__(chunk_size=480) + def _get_vad_confidence(self, chunk, sampling_rate): return 1.0 if self._vad.is_speech(chunk, sampling_rate) else 0.0 diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index 2d46a73b7..f32347db7 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +import sys + import rospy from audio_common_msgs.msg import AudioData, AudioInfo from std_msgs.msg import Bool @@ -7,10 +9,11 @@ class VADBaseNode(object): - def __init__(self): + def __init__(self, chunk_size=None): self._current_speaking = False - self._speech_audio_buffer = '' + self._speech_audio_buffer = b'' + self._audio_buffer = b'' self._threshold = rospy.get_param('~threshold', 0.5) self._minimum_duration = rospy.get_param('~minimum_duration', 0.4) @@ -27,11 +30,12 @@ def __init__(self): self._audio_info = rospy.wait_for_message('audio_info', AudioInfo) if self._audio_info.sample_format != 'S16LE': rospy.logerr('audio format must be S16LE') - return + sys.exit(1) if self._audio_info.sample_rate not in [8000, 16000, 32000, 48000]: rospy.logerr('sampling rate must be 8000 or 16000 or 32000 or 48000') - return + sys.exit(1) + self.chunk_size = chunk_size self._pub_speech_audio_info.publish(self._audio_info) self._sub = rospy.Subscriber('audio_data', AudioData, self._callback) @@ -39,17 +43,24 @@ def _get_vad_confidence(self, chunk, sampling_rate): raise NotImplementedError def _callback(self, msg): - confidence = self._get_vad_confidence(msg.data, + self._audio_buffer += msg.data + if len(self._audio_buffer) < 2 * self.chunk_size: + return + else: + audio_data = self._audio_buffer[:2 * self.chunk_size] + self._audio_buffer = self._audio_buffer[2 * self.chunk_size:] + + confidence = self._get_vad_confidence(audio_data, self._audio_info.sample_rate) is_speech = True if confidence > self._threshold else False self._pub_is_speech.publish(Bool(is_speech)) if self._current_speaking == True and is_speech == True: - self._speech_audio_buffer = self._speech_audio_buffer + msg.data + self._speech_audio_buffer = self._speech_audio_buffer + audio_data elif self._current_speaking == False and is_speech == True: - self._speech_audio_buffer = msg.data + self._speech_audio_buffer = audio_data self._current_speaking = True elif self._current_speaking == True and is_speech == False: - self._speech_audio_buffer = self._speech_audio_buffer + msg.data + self._speech_audio_buffer = self._speech_audio_buffer + audio_data speech_duration = (len(self._speech_audio_buffer) / 2.0) / self._audio_info.sample_rate if speech_duration > self._minimum_duration: From dd33c280ae47c0f8cd7e33cc361691e10a7fa1e0 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 21:12:58 +0900 Subject: [PATCH 07/19] [webrtcvad_ros] update --- webrtcvad_ros/python/webrtcvad_ros/vad_core.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index f32347db7..ae570385a 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -52,12 +52,13 @@ def _callback(self, msg): confidence = self._get_vad_confidence(audio_data, self._audio_info.sample_rate) + rospy.loginfo('confidence: {}'.format(confidence)) is_speech = True if confidence > self._threshold else False self._pub_is_speech.publish(Bool(is_speech)) if self._current_speaking == True and is_speech == True: self._speech_audio_buffer = self._speech_audio_buffer + audio_data elif self._current_speaking == False and is_speech == True: - self._speech_audio_buffer = audio_data + self._speech_audio_buffer = self._speech_audio_buffer + audio_data self._current_speaking = True elif self._current_speaking == True and is_speech == False: self._speech_audio_buffer = self._speech_audio_buffer + audio_data @@ -68,7 +69,9 @@ def _callback(self, msg): else: rospy.logwarn('speech duration: {} dropped'.format(speech_duration)) self._current_speaking = False - self._speech_audio_buffer = '' + self._speech_audio_buffer = audio_data + else: + self._speech_audio_buffer = audio_data def main(): From 956a517642b89feaeedeadda31a0a7b1bb15cb3c Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 21:32:35 +0900 Subject: [PATCH 08/19] [webrtcvad_ros] enable continuing and prebuffer --- .../python/webrtcvad_ros/vad_core.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index ae570385a..bb55ffc27 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -11,12 +11,14 @@ class VADBaseNode(object): def __init__(self, chunk_size=None): - self._current_speaking = False + self._last_speaking_time = rospy.Time() self._speech_audio_buffer = b'' self._audio_buffer = b'' - self._threshold = rospy.get_param('~threshold', 0.5) + self._threshold = rospy.get_param('~threshold', 0.1) self._minimum_duration = rospy.get_param('~minimum_duration', 0.4) + self._audio_timeout_duration = rospy.get_param('~audio_timeout_duration', + 0.5) self._pub_is_speech = rospy.Publisher('~is_speeching', Bool, queue_size=1) self._pub_speech_audio = rospy.Publisher('~speech_audio', @@ -52,26 +54,30 @@ def _callback(self, msg): confidence = self._get_vad_confidence(audio_data, self._audio_info.sample_rate) - rospy.loginfo('confidence: {}'.format(confidence)) + rospy.logdebug('confidence: {}'.format(confidence)) is_speech = True if confidence > self._threshold else False self._pub_is_speech.publish(Bool(is_speech)) - if self._current_speaking == True and is_speech == True: - self._speech_audio_buffer = self._speech_audio_buffer + audio_data - elif self._current_speaking == False and is_speech == True: - self._speech_audio_buffer = self._speech_audio_buffer + audio_data - self._current_speaking = True - elif self._current_speaking == True and is_speech == False: + + if is_speech: self._speech_audio_buffer = self._speech_audio_buffer + audio_data - speech_duration = (len(self._speech_audio_buffer) / - 2.0) / self._audio_info.sample_rate - if speech_duration > self._minimum_duration: - self._pub_speech_audio.publish(AudioData(self._speech_audio_buffer)) - else: - rospy.logwarn('speech duration: {} dropped'.format(speech_duration)) - self._current_speaking = False - self._speech_audio_buffer = audio_data + self._last_speaking_time = rospy.Time.now() else: - self._speech_audio_buffer = audio_data + if rospy.Time.now() < self._last_speaking_time + rospy.Duration( + self._audio_timeout_duration): + self._speech_audio_buffer = self._speech_audio_buffer + audio_data + rospy.logdebug('continuing...') + else: + if len(self._speech_audio_buffer) > len(audio_data): + self._speech_audio_buffer = self._speech_audio_buffer + audio_data + speech_duration = (len(self._speech_audio_buffer) / 2.0) / self._audio_info.sample_rate \ + - self.chunk_size / self._audio_info.sample_rate \ + - self._audio_timeout_duration + if speech_duration > self._minimum_duration: + self._pub_speech_audio.publish(AudioData(self._speech_audio_buffer)) + rospy.loginfo('published duration: {}'.format(speech_duration)) + else: + rospy.logwarn('speech duration: {} dropped'.format(speech_duration)) + self._speech_audio_buffer = audio_data def main(): From 74930a1ad6c702850a293d14380a614981e0da1c Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 21:33:59 +0900 Subject: [PATCH 09/19] [webrtcvad_ros] add comment --- webrtcvad_ros/python/webrtcvad_ros/vad_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index bb55ffc27..1d861528e 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -69,6 +69,7 @@ def _callback(self, msg): else: if len(self._speech_audio_buffer) > len(audio_data): self._speech_audio_buffer = self._speech_audio_buffer + audio_data + # speech_duration = speech_audio_buffer duration - pre buffer duration - timeout buffer duration speech_duration = (len(self._speech_audio_buffer) / 2.0) / self._audio_info.sample_rate \ - self.chunk_size / self._audio_info.sample_rate \ - self._audio_timeout_duration From 47de0da4ab7241f5dd8bcaff2a61c8c8c62f87a0 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 21:34:57 +0900 Subject: [PATCH 10/19] [silero_vad_ros] fix bugs --- silero_vad_ros/launch/sample.launch | 2 +- silero_vad_ros/node_scripts/vad.py | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/silero_vad_ros/launch/sample.launch b/silero_vad_ros/launch/sample.launch index 30ff409a0..79d1c0daa 100644 --- a/silero_vad_ros/launch/sample.launch +++ b/silero_vad_ros/launch/sample.launch @@ -22,7 +22,7 @@ type="speech_to_text.py" output="log" > - + language: ja-JP self_cancellation: false diff --git a/silero_vad_ros/node_scripts/vad.py b/silero_vad_ros/node_scripts/vad.py index 74221620b..3f1c48eb4 100644 --- a/silero_vad_ros/node_scripts/vad.py +++ b/silero_vad_ros/node_scripts/vad.py @@ -18,21 +18,18 @@ def __init__(self): force_reload=True) self.model_vad = model_vad - super(SileroVADROS, self).__init__() + super(SileroVADROS, self).__init__(chunk_size=1536) rospy.loginfo('Initialized.') def _get_vad_confidence(self, chunk, sampling_rate): - print('chunk: {}'.format(chunk)) - nparray = self._convert_bytearray_to_numpy_array(chunk) - print('nparray: {}'.format(nparray)) - return self.model_vad(torch.from_numpy(nparray), sampling_rate).item() - - def _convert_bytearray_to_numpy_array(self, data): - if self._audio_info.sample_format == 'S16LE': - return np.array(struct.unpack("{}h".format(int(len(data) / 2)), data)) - else: - raise ValueError() + audio_chunk = np.frombuffer(chunk, np.int16) + abs_max = np.abs(audio_chunk).max() + audio_chunk = audio_chunk.astype('float32') + if abs_max > 0: + audio_chunk *= 1 / 32768 + audio_chunk = audio_chunk.squeeze() + return self.model_vad(torch.from_numpy(audio_chunk), sampling_rate).item() def main(): From b526e17b76ea01e5b0582a1a440e452a63a924fc Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 21:57:58 +0900 Subject: [PATCH 11/19] [silero_vad_ros] Fix virtualenv --- silero_vad_ros/CMakeLists.txt | 1 + silero_vad_ros/requirements.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/silero_vad_ros/CMakeLists.txt b/silero_vad_ros/CMakeLists.txt index 79549a027..dd5e32d17 100644 --- a/silero_vad_ros/CMakeLists.txt +++ b/silero_vad_ros/CMakeLists.txt @@ -5,6 +5,7 @@ find_package(catkin REQUIRED COMPONENTS catkin_virtualenv) catkin_generate_virtualenv( PYTHON_INTERPRETER python3.8 + USE_SYSTEM_PACKAGES FALSE CHECK_VENV FALSE ) diff --git a/silero_vad_ros/requirements.txt b/silero_vad_ros/requirements.txt index 3daffcdd3..aaf7fc1be 100644 --- a/silero_vad_ros/requirements.txt +++ b/silero_vad_ros/requirements.txt @@ -1 +1,3 @@ +torch torchaudio +numpy From a86bfa3d8069898a508a2231f4ba6e6ee730032a Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 20 Apr 2023 22:15:08 +0900 Subject: [PATCH 12/19] [silero_vad_ros] add README.md --- silero_vad_ros/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 silero_vad_ros/README.md diff --git a/silero_vad_ros/README.md b/silero_vad_ros/README.md new file mode 100644 index 000000000..4788df244 --- /dev/null +++ b/silero_vad_ros/README.md @@ -0,0 +1,21 @@ +# silero_vad_ros + +This package provides VAD (Voice Activity Detection) code. It subscribes an audio topic and publish a flag if curretly speeched or not with VAD. +This package uses [silero-vad](https://github.com/snakers4/silero-vad). + +## How to build + +```bash +catkin build silero_vad_ros +``` + +## Example + +Please make sure your PC has a microphone. +And then launch. + +```bash +roslaunch silero_vad_ros sample.launch +``` + +And please talk to the microphone. From b75422a014e2c0f03874be7cf1f83d667573dbd9 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Fri, 21 Apr 2023 11:19:05 +0900 Subject: [PATCH 13/19] [silero_vad_ros] update cmake version, skip if python3.8 not found --- silero_vad_ros/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/silero_vad_ros/CMakeLists.txt b/silero_vad_ros/CMakeLists.txt index dd5e32d17..a650598bf 100644 --- a/silero_vad_ros/CMakeLists.txt +++ b/silero_vad_ros/CMakeLists.txt @@ -1,6 +1,14 @@ -cmake_minimum_required(VERSION 3.0.2) +cmake_minimum_required(VERSION 2.8.3) project(silero_vad_ros) +find_program(EXIST_PYTHON3 "python3.8") +if(NOT EXIST_PYTHON3) + message(WARNING "python3.8 command not found. exit without building") + return() +else() + message(STATUS "python3.8 command found. continue building...") +endif() + find_package(catkin REQUIRED COMPONENTS catkin_virtualenv) catkin_generate_virtualenv( From 3efb9456d517ffa42b43d81aa6670eff29ab8c89 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 21 Mar 2024 10:23:34 +0900 Subject: [PATCH 14/19] [silero_vad_ros] update demo launch --- silero_vad_ros/launch/sample.launch | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/silero_vad_ros/launch/sample.launch b/silero_vad_ros/launch/sample.launch index 79d1c0daa..db735e1b7 100644 --- a/silero_vad_ros/launch/sample.launch +++ b/silero_vad_ros/launch/sample.launch @@ -1,4 +1,6 @@ + + @@ -8,9 +10,10 @@ pkg="silero_vad_ros" type="vad.py" output="screen" + respawn="true" > - - threshold: 0.5 + + threshold: $(arg vad_threshold) @@ -25,7 +28,12 @@ language: ja-JP - self_cancellation: false + self_cancellation: true + tts_actions_names: + - sound_play + - speed_play_jp + - robotsound + - robotsound_jp tts_tolerance: 1.0 From c0da4f850bf9e4d2685a4f138a32bc3d571faaa6 Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 28 Mar 2024 12:46:19 +0900 Subject: [PATCH 15/19] [webrtcvad_ros] add maximum duration --- .../python/webrtcvad_ros/vad_core.py | 150 +++++++++--------- 1 file changed, 72 insertions(+), 78 deletions(-) diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index 1d861528e..ff2cdf03e 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -9,84 +9,78 @@ class VADBaseNode(object): - def __init__(self, chunk_size=None): + def __init__(self, chunk_size: int = 480): + + self._last_speaking_time = rospy.Time() + self._speech_audio_buffer = b"" + self._audio_buffer = b"" + + self._threshold = rospy.get_param("~threshold", 0.1) + self._minimum_duration = rospy.get_param("~minimum_duration", 1.0) + self._maximum_duration = rospy.get_param("~maximum_duration", 10.0) + self._audio_timeout_duration = rospy.get_param("~audio_timeout_duration", 0.3) + + self._pub_is_speech = rospy.Publisher("~is_speeching", Bool, queue_size=1) + self._pub_speech_audio = rospy.Publisher( + "~speech_audio", AudioData, queue_size=1 + ) + self._pub_speech_audio_info = rospy.Publisher( + "~speech_audio_info", AudioInfo, queue_size=1, latch=True + ) + + self._audio_info = rospy.wait_for_message("audio_info", AudioInfo) + if self._audio_info.sample_format != "S16LE": + rospy.logerr("audio format must be S16LE") + sys.exit(1) + if self._audio_info.sample_rate not in [8000, 16000, 32000, 48000]: + rospy.logerr("sampling rate must be 8000 or 16000 or 32000 or 48000") + sys.exit(1) + + self.chunk_size = chunk_size + self._pub_speech_audio_info.publish(self._audio_info) + self._sub = rospy.Subscriber("audio_data", AudioData, self._callback) + + def _get_vad_confidence(self, chunk: bytearray, sampling_rate: int) -> float: + raise NotImplementedError + + def _callback(self, msg): + self._audio_buffer += msg.data + if len(self._audio_buffer) < 2 * self.chunk_size: + return + else: + audio_data = self._audio_buffer[: 2 * self.chunk_size] + self._audio_buffer = self._audio_buffer[2 * self.chunk_size :] + + confidence = self._get_vad_confidence(audio_data, self._audio_info.sample_rate) + rospy.logdebug("confidence: {}".format(confidence)) + is_speech = True if confidence > self._threshold else False + self._pub_is_speech.publish(Bool(is_speech)) - self._last_speaking_time = rospy.Time() - self._speech_audio_buffer = b'' - self._audio_buffer = b'' - - self._threshold = rospy.get_param('~threshold', 0.1) - self._minimum_duration = rospy.get_param('~minimum_duration', 0.4) - self._audio_timeout_duration = rospy.get_param('~audio_timeout_duration', - 0.5) - - self._pub_is_speech = rospy.Publisher('~is_speeching', Bool, queue_size=1) - self._pub_speech_audio = rospy.Publisher('~speech_audio', - AudioData, - queue_size=1) - self._pub_speech_audio_info = rospy.Publisher('~speech_audio_info', - AudioInfo, - queue_size=1, - latch=True) - - self._audio_info = rospy.wait_for_message('audio_info', AudioInfo) - if self._audio_info.sample_format != 'S16LE': - rospy.logerr('audio format must be S16LE') - sys.exit(1) - if self._audio_info.sample_rate not in [8000, 16000, 32000, 48000]: - rospy.logerr('sampling rate must be 8000 or 16000 or 32000 or 48000') - sys.exit(1) - - self.chunk_size = chunk_size - self._pub_speech_audio_info.publish(self._audio_info) - self._sub = rospy.Subscriber('audio_data', AudioData, self._callback) - - def _get_vad_confidence(self, chunk, sampling_rate): - raise NotImplementedError - - def _callback(self, msg): - self._audio_buffer += msg.data - if len(self._audio_buffer) < 2 * self.chunk_size: - return - else: - audio_data = self._audio_buffer[:2 * self.chunk_size] - self._audio_buffer = self._audio_buffer[2 * self.chunk_size:] - - confidence = self._get_vad_confidence(audio_data, - self._audio_info.sample_rate) - rospy.logdebug('confidence: {}'.format(confidence)) - is_speech = True if confidence > self._threshold else False - self._pub_is_speech.publish(Bool(is_speech)) - - if is_speech: - self._speech_audio_buffer = self._speech_audio_buffer + audio_data - self._last_speaking_time = rospy.Time.now() - else: - if rospy.Time.now() < self._last_speaking_time + rospy.Duration( - self._audio_timeout_duration): self._speech_audio_buffer = self._speech_audio_buffer + audio_data - rospy.logdebug('continuing...') - else: - if len(self._speech_audio_buffer) > len(audio_data): - self._speech_audio_buffer = self._speech_audio_buffer + audio_data - # speech_duration = speech_audio_buffer duration - pre buffer duration - timeout buffer duration - speech_duration = (len(self._speech_audio_buffer) / 2.0) / self._audio_info.sample_rate \ - - self.chunk_size / self._audio_info.sample_rate \ + speech_buffer_size: int = len(self._speech_audio_buffer) + speech_duration: float = ( + (speech_buffer_size / 2.0) / self._audio_info.sample_rate + - self.chunk_size / self._audio_info.sample_rate - self._audio_timeout_duration - if speech_duration > self._minimum_duration: - self._pub_speech_audio.publish(AudioData(self._speech_audio_buffer)) - rospy.loginfo('published duration: {}'.format(speech_duration)) - else: - rospy.logwarn('speech duration: {} dropped'.format(speech_duration)) - self._speech_audio_buffer = audio_data - - -def main(): - - rospy.init_node('webrtcvad_ros') - node = WebRTCVADROS() - rospy.spin() - - -if __name__ == '__main__': - main() + ) + + if is_speech and speech_duration < self._maximum_duration: + self._last_speaking_time = rospy.Time.now() + else: + if rospy.Time.now() < self._last_speaking_time + rospy.Duration( + self._audio_timeout_duration + ): + rospy.logdebug("continuing...") + else: + if len(self._speech_audio_buffer) > len(audio_data): + # speech_duration = speech_audio_buffer duration - pre buffer duration - timeout buffer duration + if speech_duration > self._minimum_duration: + self._pub_speech_audio.publish( + AudioData(self._speech_audio_buffer) + ) + rospy.loginfo("published duration: {}".format(speech_duration)) + else: + rospy.logwarn( + "speech duration: {} dropped".format(speech_duration) + ) + self._speech_audio_buffer = audio_data From 26d9de528feecc9a3515498f8613b590315a230d Mon Sep 17 00:00:00 2001 From: Koki Shinjo Date: Thu, 28 Mar 2024 12:47:27 +0900 Subject: [PATCH 16/19] [silero_vad_ros] support parameters --- silero_vad_ros/launch/sample.launch | 26 +++++++++++-------- silero_vad_ros/node_scripts/vad.py | 40 ++++++++++++++--------------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/silero_vad_ros/launch/sample.launch b/silero_vad_ros/launch/sample.launch index db735e1b7..69b5e3bbd 100644 --- a/silero_vad_ros/launch/sample.launch +++ b/silero_vad_ros/launch/sample.launch @@ -1,8 +1,11 @@ + + + - + - + threshold: $(arg vad_threshold) + minimum_duration: $(arg vad_minimum_duration) + maximum_duration: $(arg vad_maximum_duration) + audio_timeout_duration: $(arg vad_audio_timeout_duration) - - + + - + language: ja-JP self_cancellation: true tts_actions_names: - - sound_play - - speed_play_jp - - robotsound - - robotsound_jp + - sound_play + - speed_play_jp + - robotsound + - robotsound_jp tts_tolerance: 1.0 @@ -44,4 +50,4 @@ type="print_stt_result.py" output="screen" /> - + \ No newline at end of file diff --git a/silero_vad_ros/node_scripts/vad.py b/silero_vad_ros/node_scripts/vad.py index 3f1c48eb4..75e8f4306 100644 --- a/silero_vad_ros/node_scripts/vad.py +++ b/silero_vad_ros/node_scripts/vad.py @@ -11,33 +11,33 @@ class SileroVADROS(VADBaseNode): - def __init__(self): + def __init__(self): - model_vad, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad', - model='silero_vad', - force_reload=True) - self.model_vad = model_vad + model_vad, _ = torch.hub.load( + repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True + ) + self.model_vad = model_vad - super(SileroVADROS, self).__init__(chunk_size=1536) + super(SileroVADROS, self).__init__(chunk_size=1536) - rospy.loginfo('Initialized.') + rospy.loginfo("Initialized.") - def _get_vad_confidence(self, chunk, sampling_rate): - audio_chunk = np.frombuffer(chunk, np.int16) - abs_max = np.abs(audio_chunk).max() - audio_chunk = audio_chunk.astype('float32') - if abs_max > 0: - audio_chunk *= 1 / 32768 - audio_chunk = audio_chunk.squeeze() - return self.model_vad(torch.from_numpy(audio_chunk), sampling_rate).item() + def _get_vad_confidence(self, chunk, sampling_rate): + audio_chunk = np.frombuffer(chunk, np.int16) + abs_max = np.abs(audio_chunk).max() + audio_chunk = audio_chunk.astype("float32") + if abs_max > 0: + audio_chunk *= 1 / 32768 + audio_chunk = audio_chunk.squeeze() + return self.model_vad(torch.from_numpy(audio_chunk), sampling_rate).item() def main(): - rospy.init_node('silero_vad_ros') - node = SileroVADROS() - rospy.spin() + rospy.init_node("silero_vad_ros") + node = SileroVADROS() + rospy.spin() -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() From 0fffca755dc047dc169b51484b6c6317f7a1a74f Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 28 Mar 2024 14:22:40 +0900 Subject: [PATCH 17/19] [webrtcvad_ros] update --- .../python/webrtcvad_ros/vad_core.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index ff2cdf03e..d6d8b58cc 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -56,14 +56,15 @@ def _callback(self, msg): is_speech = True if confidence > self._threshold else False self._pub_is_speech.publish(Bool(is_speech)) + if not is_speech and len(self._speech_audio_buffer) == 0: + return + self._speech_audio_buffer = self._speech_audio_buffer + audio_data - speech_buffer_size: int = len(self._speech_audio_buffer) speech_duration: float = ( - (speech_buffer_size / 2.0) / self._audio_info.sample_rate + (len(self._speech_audio_buffer) / 2.0) / self._audio_info.sample_rate - self.chunk_size / self._audio_info.sample_rate - self._audio_timeout_duration ) - if is_speech and speech_duration < self._maximum_duration: self._last_speaking_time = rospy.Time.now() else: @@ -72,15 +73,13 @@ def _callback(self, msg): ): rospy.logdebug("continuing...") else: - if len(self._speech_audio_buffer) > len(audio_data): - # speech_duration = speech_audio_buffer duration - pre buffer duration - timeout buffer duration - if speech_duration > self._minimum_duration: - self._pub_speech_audio.publish( - AudioData(self._speech_audio_buffer) - ) - rospy.loginfo("published duration: {}".format(speech_duration)) - else: - rospy.logwarn( - "speech duration: {} dropped".format(speech_duration) - ) - self._speech_audio_buffer = audio_data + if speech_duration > self._minimum_duration: + self._pub_speech_audio.publish( + AudioData(self._speech_audio_buffer) + ) + rospy.loginfo("published duration: {}".format(speech_duration)) + else: + rospy.logwarn( + "speech duration: {} dropped".format(speech_duration) + ) + self._speech_audio_buffer = b'' From e1ee4b03362f7244a0bcf95ce36d469c690b5bef Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 28 Mar 2024 14:29:13 +0900 Subject: [PATCH 18/19] update --- silero_vad_ros/launch/sample.launch | 6 +++--- webrtcvad_ros/python/webrtcvad_ros/vad_core.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/silero_vad_ros/launch/sample.launch b/silero_vad_ros/launch/sample.launch index 69b5e3bbd..dea5568d6 100644 --- a/silero_vad_ros/launch/sample.launch +++ b/silero_vad_ros/launch/sample.launch @@ -1,8 +1,8 @@ - + - + @@ -50,4 +50,4 @@ type="print_stt_result.py" output="screen" /> - \ No newline at end of file + diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index d6d8b58cc..037184936 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -16,9 +16,9 @@ def __init__(self, chunk_size: int = 480): self._audio_buffer = b"" self._threshold = rospy.get_param("~threshold", 0.1) - self._minimum_duration = rospy.get_param("~minimum_duration", 1.0) + self._minimum_duration = rospy.get_param("~minimum_duration", 0.6) self._maximum_duration = rospy.get_param("~maximum_duration", 10.0) - self._audio_timeout_duration = rospy.get_param("~audio_timeout_duration", 0.3) + self._audio_timeout_duration = rospy.get_param("~audio_timeout_duration", 0.5) self._pub_is_speech = rospy.Publisher("~is_speeching", Bool, queue_size=1) self._pub_speech_audio = rospy.Publisher( From 5c09ccf0a34d38e84dadb14fa05919a99254bdde Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 28 Mar 2024 14:54:41 +0900 Subject: [PATCH 19/19] bugfix --- webrtcvad_ros/python/webrtcvad_ros/vad_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py index 037184936..69cca11f4 100644 --- a/webrtcvad_ros/python/webrtcvad_ros/vad_core.py +++ b/webrtcvad_ros/python/webrtcvad_ros/vad_core.py @@ -56,7 +56,9 @@ def _callback(self, msg): is_speech = True if confidence > self._threshold else False self._pub_is_speech.publish(Bool(is_speech)) - if not is_speech and len(self._speech_audio_buffer) == 0: + if not is_speech and len(self._speech_audio_buffer) <= len(audio_data): + # Store audio_data into speech_audio_buffer everytime so that catch 's' or 'th' type sound + self._speech_audio_buffer = audio_data return self._speech_audio_buffer = self._speech_audio_buffer + audio_data @@ -82,4 +84,4 @@ def _callback(self, msg): rospy.logwarn( "speech duration: {} dropped".format(speech_duration) ) - self._speech_audio_buffer = b'' + self._speech_audio_buffer = audio_data