Merge pull request #61 from wuhongsheng/DeepFilterNet
feat:add DeepFilterNet for speech enhancement to obtain clear speech …
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import torchaudio
|
||||
from VAD.vad_iterator import VADIterator
|
||||
from baseHandler import BaseHandler
|
||||
import numpy as np
|
||||
@@ -5,7 +6,7 @@ import torch
|
||||
from rich.console import Console
|
||||
|
||||
from utils.utils import int2float
|
||||
|
||||
from df.enhance import enhance, init_df
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -28,6 +29,7 @@ class VADHandler(BaseHandler):
|
||||
min_speech_ms=500,
|
||||
max_speech_ms=float("inf"),
|
||||
speech_pad_ms=30,
|
||||
audio_enhancement=True
|
||||
):
|
||||
self.should_listen = should_listen
|
||||
self.sample_rate = sample_rate
|
||||
@@ -42,6 +44,9 @@ class VADHandler(BaseHandler):
|
||||
min_silence_duration_ms=min_silence_ms,
|
||||
speech_pad_ms=speech_pad_ms,
|
||||
)
|
||||
self.audio_enhancement = audio_enhancement
|
||||
if audio_enhancement:
|
||||
self.enhanced_model, self.df_state, _ = init_df()
|
||||
|
||||
def process(self, audio_chunk):
|
||||
audio_int16 = np.frombuffer(audio_chunk, dtype=np.int16)
|
||||
@@ -58,4 +63,13 @@ class VADHandler(BaseHandler):
|
||||
else:
|
||||
self.should_listen.clear()
|
||||
logger.debug("Stop listening")
|
||||
if self.audio_enhancement:
|
||||
if self.sample_rate != self.df_state.sr():
|
||||
audio_float32 = torchaudio.functional.resample(torch.from_numpy(array),orig_freq=self.sample_rate,
|
||||
new_freq=self.df_state.sr())
|
||||
enhanced = enhance(self.enhanced_model, self.df_state, audio_float32.unsqueeze(0))
|
||||
enhanced = torchaudio.functional.resample(enhanced, orig_freq=self.df_state.sr(),new_freq=self.sample_rate)
|
||||
else:
|
||||
enhanced = enhance(self.enhanced_model, self.df_state, audio_float32)
|
||||
array = enhanced.numpy().squeeze()
|
||||
yield array
|
||||
|
||||
@@ -39,3 +39,9 @@ class VADHandlerArguments:
|
||||
"help": "Amount of padding added to the beginning and end of detected speech segments. Measured in milliseconds. Default is 250 ms."
|
||||
},
|
||||
)
|
||||
audio_enhancement:bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": "improves sound quality by applying techniques like noise reduction, equalization, and echo cancellation. Default is True."
|
||||
},
|
||||
)
|
||||
|
||||
@@ -5,4 +5,5 @@ torch==2.4.0
|
||||
sounddevice==0.5.0
|
||||
ChatTTS>=0.1.1
|
||||
funasr>=1.1.6
|
||||
modelscope>=1.17.1
|
||||
deepfilternet>=0.5.6
|
||||
|
||||
@@ -8,3 +8,5 @@ mlx-lm>=0.14.0
|
||||
ChatTTS>=0.1.1
|
||||
funasr>=1.1.6
|
||||
modelscope>=1.17.1
|
||||
deepfilternet>=0.5.6
|
||||
|
||||
|
||||
Reference in New Issue
Block a user