Fw compliance #857

Status: Closed. 64 commits.

Commits
fc54cb9 seed, multilingual and fixes (Jiltseb, Jun 9, 2023)
84d58fa added languages in tokenizer (Jiltseb, Jun 14, 2023)
63bea66 multilingual fixes (Jiltseb, Jun 21, 2023)
b95d694 vocabulary extension fix for downloads (Jiltseb, Jun 21, 2023)
a8626bb code fixes for multilingual (Jiltseb, Jun 28, 2023)
c2ca8d4 Squash long words at window and sentence boundaries (Jiltseb, Jul 4, 2023)
9edf960 added commits specifying changes to original package (Jiltseb, Jul 26, 2023)
d008650 seed, multilingual and fixes (Jiltseb, Jun 9, 2023)
2573982 added languages in tokenizer (Jiltseb, Jun 14, 2023)
8add326 multilingual fixes (Jiltseb, Jun 21, 2023)
afc3f5c vocabulary extension fix for downloads (Jiltseb, Jun 21, 2023)
dd55c03 code fixes for multilingual (Jiltseb, Jun 28, 2023)
d34780e Squash long words at window and sentence boundaries (Jiltseb, Jul 4, 2023)
9fab8d9 added commits specifying changes to original package (Jiltseb, Jul 26, 2023)
162fbf0 modifications based on review (Jiltseb, Jul 28, 2023)
ca6a2ba removed LANGUAGES from tokenizer and added numpy requirements (Jiltseb, Oct 6, 2023)
0df6953 Merge remote-tracking branch 'upstream/master' (Jiltseb, Oct 9, 2023)
988c528 Merge local master to 'updated_js_v2.1' (Jiltseb, Oct 9, 2023)
443eb86 Merge pull request #1 from mobiusml/js_asr_v2.1_pr (Jiltseb, Oct 9, 2023)
6a51407 Update requirements.txt (Jiltseb, Oct 9, 2023)
4138e16 Merge pull request #2 from SYSTRAN/master (Jiltseb, Dec 12, 2023)
b906a98 changes to README.md (Jiltseb, Dec 13, 2023)
0464122 Added BatchedInferencePipeline (Jiltseb, Dec 13, 2023)
78b5cd7 Added language detection from multiple segments and batched inference… (Jiltseb, Dec 13, 2023)
f397e37 added additional packages (Jiltseb, Dec 13, 2023)
83895ac changes to batched inference based on the review (Jiltseb, Dec 20, 2023)
e1c1699 change in silence detection (Jiltseb, Dec 21, 2023)
b516bc8 Merge pull request #3 from mobiusml/batched_asr (Jiltseb, Dec 22, 2023)
3477d86 Merge pull request #4 from SYSTRAN/master (Jiltseb, Jan 22, 2024)
95df9eb added logic for torchaudio based feature extraction (Jiltseb, Jan 23, 2024)
0cc2d1d added requirements (Jiltseb, Jan 23, 2024)
d6624ff added feature extraction in README (Jiltseb, Jan 23, 2024)
fa69694 Merge pull request #5 from mobiusml/add_new_feat_extract (Jiltseb, Jan 23, 2024)
6698a9a removing unwanted dataclasses and non-generator transcribe function, … (Jiltseb, Mar 19, 2024)
1b6376f Merge remote-tracking branch systran/faster_whisper 'upstream/master'… (Jiltseb, Mar 19, 2024)
92867e3 uses same type annotation as faster_whisper for batched transcribe, c… (Jiltseb, Mar 25, 2024)
8452cf2 added jsons for dict conversion (Jiltseb, Mar 25, 2024)
4535963 made vad_segments as optional parameter, modified docstring (Jiltseb, Mar 25, 2024)
95671d2 made default batched asr options optional as this can be taken care d… (Jiltseb, Mar 25, 2024)
5fa21b8 Merge pull request #7 from mobiusml/fixes_and_update (Jiltseb, Mar 26, 2024)
b421086 Update requirements.txt (Jiltseb, Mar 26, 2024)
16d54e5 Update requirements.txt (Jiltseb, Mar 26, 2024)
827df36 Update requirements.txt (Jiltseb, Mar 27, 2024)
911c62d Update requirements.txt (Jiltseb, Mar 27, 2024)
fcf8519 merging with systran fw (Jiltseb, Apr 8, 2024)
e288337 adding vad model and defaults for language detection (Jiltseb, Apr 8, 2024)
9c85222 adding utility functions for vad model (Jiltseb, Apr 8, 2024)
21f4640 add pyannote dependency (Jiltseb, Apr 8, 2024)
eff5e23 adding VAD model, tests and update README (Jiltseb, Apr 9, 2024)
caaa593 update requirements (Jiltseb, Apr 10, 2024)
538366b Merge pull request #8 from mobiusml/fw_pr (Jiltseb, Apr 11, 2024)
c41e4f2 added 'use_vad_model' to better handle vad segments (Jiltseb, Apr 12, 2024)
0e8fa00 Update error message (Jiltseb, Apr 12, 2024)
0d6c62e Merge pull request #9 from mobiusml/fw_pr (Jiltseb, Apr 12, 2024)
56d68a1 added gpu implementation for vad by default (Jiltseb, Apr 28, 2024)
2812d99 adding a vad_device, modifying vad_url (Jiltseb, Apr 29, 2024)
1cd3c60 adding get_device function (Jiltseb, Apr 29, 2024)
3f27636 Merge pull request #10 from mobiusml/fw_pr_compliance (Jiltseb, Apr 29, 2024)
93c327d updating the fork (Jiltseb, May 17, 2024)
2152d11 Merge remote-tracking branch 'upstream/master' into pr_expt (Jiltseb, May 22, 2024)
10242fc updated version, credits to whisper-x, model made optional (Jiltseb, May 22, 2024)
2dde3c9 Merge branch 'master' into fw_compliance (Jiltseb, May 22, 2024)
8fd2ec0 Merge pull request #11 from mobiusml/fw_compliance (Jiltseb, May 24, 2024)
0fd5003 added compatibility for python 3.8 (Jiltseb, May 24, 2024)
README.md (30 additions, 1 deletion)
@@ -1,6 +1,6 @@
 [![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)

-# Faster Whisper transcription with CTranslate2
+# Mobius Faster Whisper transcription with CTranslate2

 **faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.

@@ -166,6 +166,35 @@ for segment in segments:
segments, _ = model.transcribe("audio.mp3")
segments = list(segments) # The transcription will actually run here.
```

### Multi-segment language detection

To use the model directly for improved language detection, use the following code snippet:

```python
from faster_whisper import WhisperModel
model = WhisperModel("medium", device="cuda", compute_type="float16")
language_info = model.detect_language_multi_segment("audio.mp3")
```
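
For reference, here is a minimal sketch of consuming the result. It assumes the method returns a dict-like object containing the detected language and a confidence score; the key names below are illustrative, so check the actual return value of `detect_language_multi_segment` before relying on them:

```python
# Hypothetical keys ("language_code", "language_confidence"): verify them
# against the return value of detect_language_multi_segment in this fork.
print("Detected:", language_info.get("language_code"))
print("Confidence:", language_info.get("language_confidence"))
```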

### Batched faster-whisper


The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-4-Clause license. This product includes software developed by Max Bain. We modified this implementation and also added kaldi-based feature extraction. It improves speed by up to 10-12x compared to the OpenAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.

The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper.

```python
from faster_whisper import BatchedInferencePipeline, WhisperModel

model = WhisperModel("medium", device="cuda", compute_type="float16")
batched_model = BatchedInferencePipeline(model=model)
result = batched_model.transcribe("audio.mp3", batch_size=16)

for segment, info in result:
print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```
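
`batch_size` trades throughput for memory: larger batches generally run faster until GPU memory is exhausted. As a hedged sketch (an assumption based on the standard `WhisperModel` API, not something this diff demonstrates), the same pipeline should also run on CPU with the usual faster-whisper options:

```python
from faster_whisper import BatchedInferencePipeline, WhisperModel

# Assumption: BatchedInferencePipeline accepts any WhisperModel instance.
# int8 is the common compute type for CPU inference with faster-whisper.
cpu_model = WhisperModel("medium", device="cpu", compute_type="int8")
batched_cpu = BatchedInferencePipeline(model=cpu_model)
result = batched_cpu.transcribe("audio.mp3", batch_size=8)
```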

### Faster Distil-Whisper

The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3)
faster_whisper/__init__.py (2 additions, 1 deletion)
@@ -1,12 +1,13 @@
 from faster_whisper.audio import decode_audio
-from faster_whisper.transcribe import WhisperModel
+from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
 from faster_whisper.utils import available_models, download_model, format_timestamp
 from faster_whisper.version import __version__

 __all__ = [
     "available_models",
     "decode_audio",
     "WhisperModel",
+    "BatchedInferencePipeline",
     "download_model",
     "format_timestamp",
     "__version__",
faster_whisper/feature_extractor.py (41 additions, 14 deletions)
@@ -1,4 +1,6 @@
 import numpy as np
+import torch
+import torchaudio.compliance.kaldi as ta_kaldi


 # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py # noqa: E501
@@ -21,6 +23,7 @@ def __init__(
         self.mel_filters = self.get_mel_filters(
             sampling_rate, n_fft, n_mels=feature_size
         )
+        self.n_mels = feature_size

     def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
         # Initialize the weights
@@ -142,29 +145,53 @@ def stft(self, frames, window):
             data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
         return data.T

-    def __call__(self, waveform, padding=True, chunk_length=None):
+    def __call__(self, waveform, enable_ta=False, padding=True, chunk_length=None):
         """
         Compute the log-Mel spectrogram of the provided audio, gives similar results
-        whisper's original torch implementation with 1e-5 tolerance.
+        to whisper's original torch implementation within 1e-5 tolerance. Additionally,
+        a faster feature extraction option using kaldi fbank features is available if
+        torchaudio is installed.
         """
+        if enable_ta:
+            waveform = waveform.astype(np.float32)
+
         if chunk_length is not None:
             self.n_samples = chunk_length * self.sampling_rate
             self.nb_max_frames = self.n_samples // self.hop_length

         if padding:
             waveform = np.pad(waveform, [(0, self.n_samples)])

+        if enable_ta:
+            audio = torch.from_numpy(waveform).unsqueeze(0)
+            fbank = ta_kaldi.fbank(
+                audio,
+                sample_frequency=self.sampling_rate,
+                window_type="hanning",
+                num_mel_bins=self.n_mels,
+            )
+            log_spec = fbank.numpy().T.astype(np.float32)  # CTranslate2 does not accept float64
+
+            # normalize
+
+            # Audioset values as default mean and std for audio
+            mean_val = -4.2677393
+            std_val = 4.5689974
+            scaled_features = (log_spec - mean_val) / (std_val * 2)
+            log_spec = scaled_features
+
+        else:
+            window = np.hanning(self.n_fft + 1)[:-1]
+
+            frames = self.fram_wave(waveform)
+            stft = self.stft(frames, window=window)
+            magnitudes = np.abs(stft[:, :-1]) ** 2
+
+            filters = self.mel_filters
+            mel_spec = filters @ magnitudes
+
+            log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
+            log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
+            log_spec = (log_spec + 4.0) / 4.0

         return log_spec
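
Outside the diff, a minimal sketch of exercising both extraction paths; this assumes the module keeps faster-whisper's usual constructor defaults (80 mel bins, 16 kHz sampling rate) and that `torch` and `torchaudio` are installed:

```python
import numpy as np

from faster_whisper.feature_extractor import FeatureExtractor

# Assumes the default constructor arguments (feature_size=80, sampling_rate=16000).
extractor = FeatureExtractor()
waveform = np.random.randn(16000).astype(np.float32)  # 1 second of noise as a stand-in

mel_numpy = extractor(waveform)                  # original NumPy STFT path
mel_kaldi = extractor(waveform, enable_ta=True)  # torchaudio kaldi fbank path

# The two paths normalize differently (log10 clipping vs. Audioset mean/std),
# so their outputs are similar in shape but not numerically interchangeable.
print(mel_numpy.shape, mel_kaldi.shape)
```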