jsk-ros-pkg · sktometometo · Apr 17, 2023 · Apr 17, 2023 · Apr 17, 2023 · Apr 17, 2023
diff --git a/silero_vad_ros/CMakeLists.txt b/silero_vad_ros/CMakeLists.txt
@@ -0,0 +1,38 @@
+# silero_vad_ros: catkin package that runs node_scripts/vad.py inside a
+# Python 3.8 virtualenv created by catkin_virtualenv.
+cmake_minimum_required(VERSION 2.8.3)
+project(silero_vad_ros)
+
+# The virtualenv below is created with python3.8; when that interpreter is
+# missing, warn and stop processing this CMakeLists.txt.
+find_program(EXIST_PYTHON3 "python3.8")
+if(NOT EXIST_PYTHON3)
+    message(WARNING "python3.8 command not found. exit without building")
+    return()
+else()
+    message(STATUS "python3.8 command found. continue building...")
+endif()
+
+find_package(catkin REQUIRED COMPONENTS catkin_virtualenv)
+
+# Virtualenv without system site-packages, driven by python3.8.
+catkin_generate_virtualenv(
+    PYTHON_INTERPRETER python3.8
+    USE_SYSTEM_PACKAGES FALSE
+    CHECK_VENV FALSE
+)
+
+catkin_package(
+)
+
+catkin_install_python(PROGRAMS
+    node_scripts/vad.py
+    DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
+
+# requirements.txt is referenced by <pip_requirements> in package.xml and
+# sample.launch is referenced from the README, so both have to be present in
+# the install space as well.
+install(FILES requirements.txt
+    DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION})
+install(DIRECTORY launch
+    DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION})
diff --git a/silero_vad_ros/README.md b/silero_vad_ros/README.md
@@ -0,0 +1,21 @@
+# silero_vad_ros
+
+This package provides VAD (Voice Activity Detection) code. It subscribes to an audio topic and publishes a flag indicating whether speech is currently detected by VAD.
+This package uses [silero-vad](https://github.com/snakers4/silero-vad).
+
+## How to build
+
+```bash
+catkin build silero_vad_ros
+```
+
+## Example
+
+Please make sure your PC has a microphone.
+And then launch.
+
+```bash
+roslaunch silero_vad_ros sample.launch
+```
+
+And please talk to the microphone.
diff --git a/silero_vad_ros/launch/sample.launch b/silero_vad_ros/launch/sample.launch
@@ -0,0 +1,45 @@
+<launch>
+    <!-- Values forwarded to the silero_vad_ros node as private parameters. -->
+    <arg name="vad_threshold" default="0.5"/>
+    <arg name="vad_minimum_duration" default="0.6"/>
+    <arg name="vad_maximum_duration" default="10.0"/>
+    <arg name="vad_audio_timeout_duration" default="0.5"/>
+
+    <!-- Include audio_capture's capture.launch with format set to wave. -->
+    <include file="$(find audio_capture)/launch/capture.launch">
+        <arg name="format" value="wave"/>
+    </include>
+
+    <!-- VAD node; audio_data and audio_info are remapped to the /audio topics. -->
+    <node name="silero_vad_ros" pkg="silero_vad_ros" type="vad.py"
+          output="screen" respawn="true">
+        <rosparam subst_value="true">
+            threshold: $(arg vad_threshold)
+            minimum_duration: $(arg vad_minimum_duration)
+            maximum_duration: $(arg vad_maximum_duration)
+            audio_timeout_duration: $(arg vad_audio_timeout_duration)
+        </rosparam>
+        <remap from="audio_data" to="/audio/audio"/>
+        <remap from="audio_info" to="/audio/audio_info"/>
+    </node>
+
+    <!-- Speech-to-text node; its audio input is /silero_vad_ros/speech_audio. -->
+    <!-- NOTE(review): confirm that speech_to_text.py reads "tts_actions_names" and that
+         "speed_play_jp" is an existing action name; both look like possible typos. -->
+    <node name="speech_to_text" pkg="respeaker_ros" type="speech_to_text.py" output="log">
+        <remap from="audio" to="/silero_vad_ros/speech_audio"/>
+        <rosparam subst_value="true">
+            language: ja-JP
+            self_cancellation: true
+            tts_actions_names:
+            - sound_play
+            - speed_play_jp
+            - robotsound
+            - robotsound_jp
+            tts_tolerance: 1.0
+        </rosparam>
+    </node>
+
+    <!-- Helper node from webrtcvad_ros (print_stt_result.py), output to the screen. -->
+    <node name="print_stt_result" pkg="webrtcvad_ros" type="print_stt_result.py" output="screen"/>
+</launch>
diff --git a/silero_vad_ros/node_scripts/vad.py b/silero_vad_ros/node_scripts/vad.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+import struct
+
+import numpy as np
+import rospy
+import torch
+
+from webrtcvad_ros.vad_core import VADBaseNode
+
+
+class SileroVADROS(VADBaseNode):
+    """VAD node that scores audio chunks with the silero-vad model.
+
+    Private ROS parameters:
+        ~force_reload (bool, default True): forwarded to ``torch.hub.load``.
+            Set it to false to reuse the cached hub checkout instead of
+            downloading the model repository again on every start.
+    """
+
+    def __init__(self):
+        # Keep the previous behaviour (always re-download) as the default, but
+        # allow it to be switched off, e.g. for offline use or respawned nodes.
+        force_reload = rospy.get_param("~force_reload", True)
+        model_vad, _ = torch.hub.load(
+            repo_or_dir="snakers4/silero-vad",
+            model="silero_vad",
+            force_reload=force_reload,
+        )
+        self.model_vad = model_vad
+
+        super(SileroVADROS, self).__init__(chunk_size=1536)
+
+        rospy.loginfo("Initialized.")
+
+    def _get_vad_confidence(self, chunk, sampling_rate):
+        """Return the model output for one chunk of audio.
+
+        Args:
+            chunk: buffer of raw samples, read as 16-bit signed integers.
+            sampling_rate: sampling rate passed on to the model.
+
+        Returns:
+            The scalar model output for this chunk as a Python number.
+        """
+        # Convert to float32 before scaling so the full int16 range (including
+        # -32768) maps into [-1.0, 1.0); the float divisor keeps the result
+        # independent of the interpreter's integer-division rules.
+        samples = np.frombuffer(chunk, np.int16).astype(np.float32) / 32768.0
+        samples = samples.squeeze()
+        # Inference only, so skip autograd bookkeeping.
+        with torch.no_grad():
+            confidence = self.model_vad(torch.from_numpy(samples), sampling_rate)
+        return confidence.item()
+
+
+def main():
+    """Initialise the ROS node, create the VAD node object and spin."""
+    rospy.init_node("silero_vad_ros")
+    node = SileroVADROS()
+    rospy.spin()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/silero_vad_ros/package.xml b/silero_vad_ros/package.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>silero_vad_ros</name>
+  <version>2.1.24</version>
+  <description>This package provides a wrapper node for silero_vad. It subscribes to an audio topic and
+    publishes a flag indicating whether speech is currently detected by VAD.</description>
+
+  <maintainer email="[email protected]">Kei Okada</maintainer>
+  <maintainer email="[email protected]">Koki Shinjo</maintainer>
+
+  <author email="[email protected]">Koki Shinjo</author>
+
+  <license>BSD</license>
+
+  <buildtool_depend>catkin</buildtool_depend>
+
+  <build_depend>catkin_virtualenv</build_depend>
+
+  <exec_depend>audio_common_msgs</exec_depend>
+  <exec_depend>rospy</exec_depend>
+  <exec_depend>python-webrtcvad-pip</exec_depend>
+  <exec_depend>webrtcvad_ros</exec_depend>
+
+  <export>
+    <pip_requirements>requirements.txt</pip_requirements>
+  </export>
+</package>
diff --git a/silero_vad_ros/requirements.txt b/silero_vad_ros/requirements.txt
@@ -0,0 +1,3 @@
+torch
+torchaudio
+numpy
diff --git a/webrtcvad_ros/CMakeLists.txt b/webrtcvad_ros/CMakeLists.txt
@@ -3,4 +3,6 @@ project(webrtcvad_ros)
 
 find_package(catkin REQUIRED)
 
+catkin_python_setup()
+
 catkin_package()
diff --git a/webrtcvad_ros/README.md b/webrtcvad_ros/README.md
@@ -1,6 +1,7 @@
 # webrtcvad_ros
 
-This package provides a wrapper node for [webrtcvad](https://github.com/wiseman/py-webrtcvad). It subscribes an audio topic and publish a flag if curretly speeched or not with VAD.
+This package provides VAD (Voice Activity Detection) code. It subscribes to an audio topic and publishes a flag indicating whether speech is currently detected by VAD.
+This package uses [webrtcvad](https://github.com/wiseman/py-webrtcvad).
 
 ## Prerequities
 

diff --git a/webrtcvad_ros/launch/sample.launch b/webrtcvad_ros/launch/sample.launch
@@ -1,27 +1,27 @@
 <launch>
     <include file="$(find audio_capture)/launch/capture.launch">
-        <arg name="format" value="wave" />
+        <arg name="format" value="wave"/>
     </include>
 
     <node
         name="webrtcvad_ros"
         pkg="webrtcvad_ros"
-        type="webrtcvad_ros.py"
+        type="vad.py"
         output="screen"
-        >
+    >
         <rosparam>
             aggressiveness: 1
         </rosparam>
-        <remap from="audio_data" to="/audio/audio" />
-        <remap from="audio_info" to="/audio/audio_info" />
+        <remap from="audio_data" to="/audio/audio"/>
+        <remap from="audio_info" to="/audio/audio_info"/>
     </node>
 
     <node
         name="speech_to_text"
         pkg="respeaker_ros"
         type="speech_to_text.py"
         output="log"
-        >
+    >
         <remap from="audio" to="/webrtcvad_ros/speech_audio"/>
         <rosparam subst_value="true">
             language: ja-JP
@@ -35,5 +35,5 @@
         pkg="webrtcvad_ros"
         type="print_stt_result.py"
         output="screen"
-        />
+    />
 </launch>
diff --git a/webrtcvad_ros/node_scripts/vad.py b/webrtcvad_ros/node_scripts/vad.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+import rospy
+import webrtcvad
+
+from webrtcvad_ros.vad_core import VADBaseNode
+
+
+class WebRTCVADROS(VADBaseNode):
+  """VAD node that classifies audio chunks with webrtcvad."""
+
+  def __init__(self):
+    # The ~aggressiveness parameter (default 1) is cast to int for webrtcvad.Vad.
+    level = int(rospy.get_param('~aggressiveness', 1))
+    self._vad = webrtcvad.Vad(level)
+    super(WebRTCVADROS, self).__init__(chunk_size=480)
+
+  def _get_vad_confidence(self, chunk, sampling_rate):
+    """Return 1.0 if webrtcvad reports speech for ``chunk``, otherwise 0.0."""
+    if self._vad.is_speech(chunk, sampling_rate):
+      return 1.0
+    return 0.0
+
+
+def main():
+  """Initialise the webrtcvad_ros node, create the VAD object and spin."""
+  rospy.init_node('webrtcvad_ros')
+  node = WebRTCVADROS()
+  rospy.spin()
+
+
+if __name__ == '__main__':
+  main()
diff --git a/webrtcvad_ros/node_scripts/webrtcvad_ros.py b/webrtcvad_ros/node_scripts/webrtcvad_ros.py
diff --git a/webrtcvad_ros/python/webrtcvad_ros/__init__.py b/webrtcvad_ros/python/webrtcvad_ros/__init__.py