Commit

Merge pull request #14 from mobiusml/fw_changes
Changes as per the review suggestions for Faster Whisper PR
Jiltseb authored Jun 13, 2024
2 parents b10b8cb + a0d3891 commit 3c22842
Showing 8 changed files with 345 additions and 122 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -180,7 +180,7 @@ language_info = model.detect_language_multi_segment("audio.mp3")
### Batched faster-whisper


The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-4-Clause license. This product includes software developed by Max Bain. We modified this implementation and also added Kaldi-based feature extraction. It improves speed by up to 10-12x compared to the OpenAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.
The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-4-Clause license, and integrates its VAD model into this library. This product includes software developed by Max Bain. We modified this implementation and also added Kaldi-based feature extraction. It improves speed by up to 10-12x compared to the OpenAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.

The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper.
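The README snippet itself is collapsed in this diff view; as a rough stand-in, here is a minimal usage sketch assuming the `BatchedInferencePipeline` wrapper introduced by this fork. The model name, `batch_size`, and the returned values are illustrative and may differ from the actual README example.

```python
from faster_whisper import WhisperModel, BatchedInferencePipeline

# Load a regular faster-whisper model, then wrap it for batched inference
model = WhisperModel("large-v3", device="cuda", compute_type="float16")
batched_model = BatchedInferencePipeline(model=model)

# Transcribe semantically meaningful audio chunks in batches
segments, info = batched_model.transcribe("audio.mp3", batch_size=16)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```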

@@ -263,6 +263,7 @@ See more model and transcription options in the [`WhisperModel`](https://github.
Here is a non-exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!


* [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI-compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, and supports streaming and live transcription.
* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment.
* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
Binary file added faster_whisper/assets/pyannote_vad_model.bin
21 changes: 21 additions & 0 deletions faster_whisper/audio.py
@@ -15,6 +15,27 @@
import av
import numpy as np

# Audio Hyperparameters

SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30


def exact_div(x, y):
assert x % y == 0
return x // y


N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input

N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions have stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
TIME_PRECISION = 1 / TOKENS_PER_SECOND


def decode_audio(
input_file: Union[str, BinaryIO],
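For reference (not part of the diff), a small sketch of how the constants added to `audio.py` relate when converting between spectrogram frames, timestamp tokens, and seconds; the helper names below are illustrative and only the definitions themselves come from the diff above.

```python
SAMPLE_RATE = 16000
HOP_LENGTH = 160
CHUNK_LENGTH = 30

N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE              # 480000 samples per 30-second chunk
N_FRAMES = N_SAMPLES // HOP_LENGTH                  # 3000 mel frames per chunk
N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2                # stride-2 convolutions halve the frame rate
TOKENS_PER_SECOND = SAMPLE_RATE // N_SAMPLES_PER_TOKEN  # 50 timestamp tokens per second
TIME_PRECISION = 1 / TOKENS_PER_SECOND              # 0.02 s per timestamp token


def frames_to_seconds(n_frames: int) -> float:
    # each mel frame covers HOP_LENGTH samples, i.e. 10 ms at 16 kHz
    return n_frames * HOP_LENGTH / SAMPLE_RATE


def token_index_to_seconds(token_index: int) -> float:
    # each timestamp token covers two frames, i.e. 20 ms
    return token_index * TIME_PRECISION


assert frames_to_seconds(N_FRAMES) == CHUNK_LENGTH  # 3000 frames -> 30 s
```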
4 changes: 2 additions & 2 deletions faster_whisper/feature_extractor.py
@@ -163,7 +163,7 @@ def __call__(self, waveform, enable_ta=False, padding=True, chunk_length=None):
waveform = np.pad(waveform, [(0, self.n_samples)])

if enable_ta:
audio = torch.from_numpy(waveform).unsqueeze(0)
audio = torch.from_numpy(waveform).unsqueeze(0).float()
fbank = ta_kaldi.fbank(
audio,
sample_frequency=self.sampling_rate,
@@ -177,7 +177,7 @@ def __call__(self, waveform, enable_ta=False, padding=True, chunk_length=None):
# Audioset values as default mean and std for audio
mean_val = -4.2677393
std_val = 4.5689974
scaled_features = (log_spec - (mean_val)) / (std_val * 2)
scaled_features = (log_spec - mean_val) / (std_val * 2)
log_spec = scaled_features

else:
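For context, a self-contained sketch of the Kaldi-based feature path touched in this file, assuming torchaudio's `compliance.kaldi` module. Only the `.float()` cast and the Audioset mean/std normalization are taken from the diff; `num_mel_bins`, the function name, and the example input are illustrative.

```python
import numpy as np
import torch
import torchaudio.compliance.kaldi as ta_kaldi


def kaldi_log_mel(waveform: np.ndarray, sampling_rate: int = 16000, n_mels: int = 80) -> torch.Tensor:
    # ta_kaldi.fbank expects a float tensor of shape (channels, samples)
    audio = torch.from_numpy(waveform).unsqueeze(0).float()
    log_spec = ta_kaldi.fbank(
        audio,
        sample_frequency=sampling_rate,
        num_mel_bins=n_mels,
    )
    # Audioset values used as default mean and std in the diff above
    mean_val = -4.2677393
    std_val = 4.5689974
    return (log_spec - mean_val) / (std_val * 2)


# Example: one second of noise -> (frames, n_mels) normalized log-mel features
features = kaldi_log_mel(np.random.randn(16000).astype(np.float32))
print(features.shape)
```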
[Diffs for the remaining changed files were not loaded in this view.]
