Merge pull request #611 from pipecat-ai/aleix/audio-filters

introduce audio filters
pipecat-ai · Nov 5, 2024 · 6082da2 · 6082da2
2 parents 126324c + 358c458
commit 6082da2
Show file tree

Hide file tree

Showing 11 changed files with 154 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,9 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Introduce output transport audio mixers. Output transport audio mixers can be
-  used, for example, to add background sounds or any other audio mixing
-  functionality before the output audio is actually written to the transport.
+- Added audio filter `NoisereduceFilter`.
+
+- Introduce input transport audio filters (`BaseAudioFilter`). Audio filters can
+  be used to remove background noise before audio is sent to VAD.
+
+- Introduce output transport audio mixers (`BaseAudioMixer`). Output transport
+  audio mixers can be used, for example, to add background sounds or any other
+  audio mixing functionality before the output audio is actually written to the
+  transport.
 
 - Added `GatedOpenAILLMContextAggregator`. This aggregator keeps the last
   received OpenAI LLM context frame and it doesn't let it through until the

diff --git a/pyproject.toml b/pyproject.toml
@@ -56,6 +56,7 @@ livekit = [ "livekit~=0.17.5", "livekit-api~=0.7.1", "tenacity~=8.5.0" ]
 lmnt = [ "lmnt~=1.1.4" ]
 local = [ "pyaudio~=0.2.14" ]
 moondream = [ "einops~=0.8.0", "timm~=1.0.8", "transformers~=4.44.0" ]
+noisereduce = [ "noisereduce~=3.0.3" ]
 openai = [ "openai~=1.50.2", "websockets~=13.1", "python-deepcompare~=1.0.1" ]
 openpipe = [ "openpipe~=4.24.0" ]
 playht = [ "pyht~=0.1.4", "websockets~=13.1" ]

diff --git a/src/pipecat/audio/filters/__init__.py b/src/pipecat/audio/filters/__init__.py
diff --git a/src/pipecat/audio/filters/base_audio_filter.py b/src/pipecat/audio/filters/base_audio_filter.py
@@ -0,0 +1,47 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+from abc import ABC, abstractmethod
+
+from pipecat.frames.frames import FilterControlFrame
+
+
+class BaseAudioFilter(ABC):
+    """Abstract interface for input transport audio filters. When a filter is
+    given to the input transport, the transport runs it on incoming audio
+    before VAD and before pushing the audio downstream. Control frames can
+    update the filter settings or enable/disable the filter at runtime.
+
+    """
+
+    @abstractmethod
+    async def start(self, sample_rate: int):
+        """Called from the input transport when the transport starts, so the
+        filter can initialize itself. The input transport sample rate is passed
+        in so the filter can adapt to it.
+
+        """
+
+    @abstractmethod
+    async def stop(self):
+        """Called from the input transport when the transport is stopping.
+
+        """
+
+    @abstractmethod
+    async def process_frame(self, frame: FilterControlFrame):
+        """Called when the input transport forwards a filter control frame to
+        this filter.
+
+        """
+
+    @abstractmethod
+    async def filter(self, audio: bytes) -> bytes:
+        """Called by the input transport with the raw audio of each input
+        frame, before VAD. Returns the audio bytes that replace the frame's
+        audio.
+
+        """
diff --git a/src/pipecat/audio/filters/noisereduce_filter.py b/src/pipecat/audio/filters/noisereduce_filter.py
@@ -0,0 +1,54 @@
+#
+# Copyright (c) 2024, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import numpy as np
+
+from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
+
+from loguru import logger
+
+from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
+
+try:
+    import noisereduce as nr
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use the noisereduce filter, you need to `pip install pipecat-ai[noisereduce]`."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+class NoisereduceFilter(BaseAudioFilter):
+    """Input audio filter that denoises audio with the `noisereduce` package.
+
+    Audio buffers are interpreted as 16-bit integer samples. Filtering is
+    enabled by default and can be switched on or off at runtime with a
+    `FilterEnableFrame`.
+
+    """
+
+    def __init__(self) -> None:
+        self._filtering = True
+        # Placeholder until `start()` provides the real sample rate.
+        self._sample_rate = 0
+
+    async def start(self, sample_rate: int):
+        """Remember the sample rate; it is passed to `reduce_noise` on every
+        `filter()` call.
+
+        """
+        self._sample_rate = sample_rate
+
+    async def stop(self):
+        """Nothing to clean up for this filter."""
+        pass
+
+    async def process_frame(self, frame: FilterControlFrame):
+        """Enable or disable filtering when a `FilterEnableFrame` is received.
+        Any other filter control frame is ignored.
+
+        """
+        if isinstance(frame, FilterEnableFrame):
+            self._filtering = frame.enable
+
+    async def filter(self, audio: bytes) -> bytes:
+        """Return a noise-reduced copy of `audio`.
+
+        The input is returned unchanged when filtering is disabled or when the
+        buffer is empty. Otherwise the result is clipped to the 16-bit integer
+        range and returned as bytes.
+
+        """
+        # An empty buffer has no samples to denoise, so skip the conversion and
+        # the library call entirely.
+        if not self._filtering or not audio:
+            return audio
+
+        data = np.frombuffer(audio, dtype=np.int16)
+
+        # Add a small epsilon to avoid division by zero.
+        epsilon = 1e-10
+        data = data.astype(np.float32) + epsilon
+
+        # Noise reduction
+        reduced_noise = nr.reduce_noise(y=data, sr=self._sample_rate)
+        # Clip before the cast so out-of-range values do not wrap around.
+        audio = np.clip(reduced_noise, -32768, 32767).astype(np.int16).tobytes()
+
+        return audio
diff --git a/src/pipecat/audio/mixers/base_audio_mixer.py b/src/pipecat/audio/mixers/base_audio_mixer.py
@@ -6,7 +6,7 @@
 
 from abc import ABC, abstractmethod
 
-from pipecat.frames.frames import Frame
+from pipecat.frames.frames import MixerControlFrame
 
 
 class BaseAudioMixer(ABC):
@@ -36,7 +36,7 @@ async def stop(self):
         pass
 
     @abstractmethod
-    async def process_frame(self, frame: Frame):
+    async def process_frame(self, frame: MixerControlFrame):
         """This will be called when the output transport receives a
         MixerControlFrame.
 

diff --git a/src/pipecat/audio/mixers/soundfile_mixer.py b/src/pipecat/audio/mixers/soundfile_mixer.py
@@ -12,7 +12,7 @@
 
 from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
 from pipecat.audio.utils import resample_audio
-from pipecat.frames.frames import Frame, MixerUpdateSettingsFrame, MixerEnableFrame
+from pipecat.frames.frames import MixerControlFrame, MixerUpdateSettingsFrame, MixerEnableFrame
 
 from loguru import logger
 
@@ -65,7 +65,7 @@ async def start(self, sample_rate: int):
     async def stop(self):
         pass
 
-    async def process_frame(self, frame: Frame):
+    async def process_frame(self, frame: MixerControlFrame):
         if isinstance(frame, MixerUpdateSettingsFrame):
             await self._update_settings(frame)
         elif isinstance(frame, MixerEnableFrame):

diff --git a/src/pipecat/frames/frames.py b/src/pipecat/frames/frames.py
@@ -584,9 +584,30 @@ class VADParamsUpdateFrame(ControlFrame):
     params: VADParams
 
 
+@dataclass
+class FilterControlFrame(ControlFrame):
+    """Base control frame for other audio filter frames. It adds no fields of
+    its own; it gives the filter-related control frames a common type.
+
+    """
+
+    pass
+
+
+@dataclass
+class FilterUpdateSettingsFrame(FilterControlFrame):
+    """Control frame to update filter settings."""
+
+    # Settings to apply, keyed by setting name.
+    # NOTE(review): the accepted keys are presumably defined by each filter
+    # implementation -- confirm against the filter in use.
+    settings: Mapping[str, Any]
+
+
+@dataclass
+class FilterEnableFrame(FilterControlFrame):
+    """Control frame to enable or disable the filter at runtime."""
+
+    # True to enable filtering, False to disable it.
+    enable: bool
+
+
 @dataclass
 class MixerControlFrame(ControlFrame):
-    """Base control frame for other mixer frames."""
+    """Base control frame for other audio mixer frames."""
 
     pass
 

diff --git a/src/pipecat/transports/base_input.py b/src/pipecat/transports/base_input.py
@@ -14,6 +14,7 @@
     BotInterruptionFrame,
     CancelFrame,
     EndFrame,
+    FilterUpdateSettingsFrame,
     Frame,
     InputAudioRawFrame,
     StartFrame,
@@ -41,6 +42,9 @@ def __init__(self, params: TransportParams, **kwargs):
         self._audio_task = None
 
     async def start(self, frame: StartFrame):
+        # Start audio filter.
+        if self._params.audio_in_filter:
+            await self._params.audio_in_filter.start(self._params.audio_in_sample_rate)
         # Create audio input queue and task if needed.
         if self._params.audio_in_enabled or self._params.vad_enabled:
             self._audio_in_queue = asyncio.Queue()
@@ -52,6 +56,9 @@ async def stop(self, frame: EndFrame):
             self._audio_task.cancel()
             await self._audio_task
             self._audio_task = None
+        # Stop audio filter.
+        if self._params.audio_in_filter:
+            await self._params.audio_in_filter.stop()
 
     async def cancel(self, frame: CancelFrame):
         # Cancel and wait for the audio input task to finish.
@@ -100,6 +107,8 @@ async def process_frame(self, frame: Frame, direction: FrameDirection):
             vad_analyzer = self.vad_analyzer()
             if vad_analyzer:
                 vad_analyzer.set_params(frame.params)
+        elif isinstance(frame, FilterUpdateSettingsFrame) and self._params.audio_in_filter:
+            await self._params.audio_in_filter.process_frame(frame)
         # Other frames
         else:
             await self.push_frame(frame, direction)
@@ -165,6 +174,10 @@ async def _audio_task_handler(self):
 
                 audio_passthrough = True
 
+                # If an audio filter is available, run it before VAD.
+                if self._params.audio_in_filter:
+                    frame.audio = await self._params.audio_in_filter.filter(frame.audio)
+
                 # Check VAD and push event if necessary. We just care about
                 # changes from QUIET to SPEAKING and vice versa.
                 if self._params.vad_enabled:

diff --git a/src/pipecat/transports/base_output.py b/src/pipecat/transports/base_output.py
@@ -75,13 +75,15 @@ def __init__(self, params: TransportParams, **kwargs):
         self._bot_speaking = False
 
     async def start(self, frame: StartFrame):
+        # Start audio mixer.
         if self._params.audio_out_mixer:
             await self._params.audio_out_mixer.start(self._params.audio_out_sample_rate)
         self._create_output_tasks()
         self._create_sink_tasks()
 
     async def stop(self, frame: EndFrame):
         await self._cancel_output_tasks()
+        # Stop audio mixer.
         if self._params.audio_out_mixer:
             await self._params.audio_out_mixer.stop()
 

diff --git a/src/pipecat/transports/base_transport.py b/src/pipecat/transports/base_transport.py
@@ -13,6 +13,7 @@
 from pydantic import ConfigDict
 from pydantic.main import BaseModel
 
+from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
 from pipecat.audio.mixers.base_audio_mixer import BaseAudioMixer
 from pipecat.audio.vad.vad_analyzer import VADAnalyzer
 from pipecat.processors.frame_processor import FrameProcessor
@@ -39,6 +40,7 @@ class TransportParams(BaseModel):
     audio_in_enabled: bool = False
     audio_in_sample_rate: int = 16000
     audio_in_channels: int = 1
+    audio_in_filter: Optional[BaseAudioFilter] = None
     vad_enabled: bool = False
     vad_audio_passthrough: bool = False
     vad_analyzer: VADAnalyzer | None = None