From 1becddb5908023c529bcdd0f0285f08410cacefd Mon Sep 17 00:00:00 2001 From: ben91lin Date: Tue, 4 Jun 2024 12:49:14 +0800 Subject: [PATCH] Improve language detection when using clip_timestamps --- faster_whisper/transcribe.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index ce7fa999..f8ff96e0 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -370,15 +370,24 @@ def transcribe( or language_detection_segments < 1 ): language_detection_segments = 1 - seek = 0 + if isinstance(clip_timestamps, str): + start_timestamp = float(clip_timestamps.split(",")[0]) + else: + start_timestamp = clip_timestamps[0] detected_language_info = {} + seek = int( + start_timestamp * self.feature_extractor.sampling_rate + // self.feature_extractor.hop_length + ) content_frames = ( features.shape[-1] - self.feature_extractor.nb_max_frames ) + end_frames = ( + seek + self.feature_extractor.nb_max_frames * language_detection_segments + ) while ( seek <= content_frames - and seek - < self.feature_extractor.nb_max_frames * language_detection_segments + and seek < end_frames ): segment = features[ :, seek : seek + self.feature_extractor.nb_max_frames