wenet-e2e · robin1001 · Jul 10, 2024 · Jul 10, 2024 · Jul 10, 2024
diff --git a/requirements.txt b/requirements.txt
@@ -21,5 +21,5 @@ lmdb==1.3.0
 onnxruntime
 soundfile==0.10.3.post1
 pypeln==0.4.9
-silero-vad @ git+https://github.com/pengzhendong/silero-vad.git
+silero-vad
 pre-commit==3.5.0
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
     "kaldiio",
     "torch>=1.12.0",
     "torchaudio>=0.12.0",
-    "silero-vad @ git+https://github.com/pengzhendong/silero-vad.git",
+    "silero-vad",
 ]
 
 setup(

diff --git a/wespeaker/cli/speaker.py b/wespeaker/cli/speaker.py
@@ -17,7 +17,7 @@
 import sys
 
 import numpy as np
-from silero_vad import SileroVAD
+from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
 import torch
 import torchaudio
 import torchaudio.compliance.kaldi as kaldi
@@ -47,7 +47,7 @@ def __init__(self, model_dir: str):
         self.model = get_speaker_model(
             configs['model'])(**configs['model_args'])
         load_checkpoint(self.model, model_path)
-        self.vad = SileroVAD()
+        self.vad = load_silero_vad()
         self.table = {}
         self.resample_rate = 16000
         self.apply_vad = False
@@ -141,8 +141,8 @@ def extract_embedding(self, audio_path: str):
         if self.apply_vad:
             # TODO(Binbin Zhang): Refine the segments logic, here we just
             # suppose there is only silence at the start/end of the speech
-            segments = self.vad.get_speech_timestamps(audio_path,
-                                                      return_seconds=True)
+            wav = read_audio(audio_path)
+            segments = get_speech_timestamps(wav, self.vad, return_seconds=True)
             pcmTotal = torch.Tensor()
             if len(segments) > 0:  # remove all the silence
                 for segment in segments:

diff --git a/wespeaker/models/repvgg.py b/wespeaker/models/repvgg.py
@@ -32,8 +32,8 @@
 import wespeaker.models.pooling_layers as pooling_layers
 
 optional_groupwise_layers = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26]
-g2_map = {l: 2 for l in optional_groupwise_layers}
-g4_map = {l: 4 for l in optional_groupwise_layers}
+g2_map = dict.fromkeys(optional_groupwise_layers, 2)
+g4_map = dict.fromkeys(optional_groupwise_layers, 4)
 
 
 class SEBlock_2D(torch.nn.Module):