From b5148ec259ac6f7c123708dec0cc315546658b9a Mon Sep 17 00:00:00 2001 From: Eric Swanson Date: Thu, 19 Oct 2023 21:32:07 -0400 Subject: [PATCH 1/3] Load features onto the GPU in batches to support arbitrarily long audio --- main.py | 64 ++++++++++++++++++++++++++++++++++++++++------------- settings.py | 10 +++++++++ 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/main.py b/main.py index b7f05bb..7283965 100644 --- a/main.py +++ b/main.py @@ -89,6 +89,13 @@ torchaudio.set_audio_backend('soundfile') +# you can chunkit +def chunkit(l, n): + """Yield successive n-sized chunks from list l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + + # Function to create a wav file from stream data def write_stream_wav(data, rate, bits, ch): file = io.BytesIO() @@ -234,6 +241,8 @@ async def create_datagram_endpoint(self, protocol_factory, local_addr: Tuple[str # Chatbot model max length chatbot_max_new_tokens = settings.chatbot_max_new_tokens +concurrent_gpu_chunks = settings.concurrent_gpu_chunks + # Try CUDA device = "cuda" if torch.cuda.is_available() else "cpu" @@ -260,9 +269,10 @@ async def create_datagram_endpoint(self, protocol_factory, local_addr: Tuple[str logger.info(f'CUDA: Device {cuda_dev_num} total memory: {cuda_total_memory} bytes') logger.info(f'CUDA: Device {cuda_dev_num} free memory: {cuda_free_memory} bytes') - # Disable chunking if card has less than 10GB VRAM (complete guess) + # Disable chunking if card has too little VRAM # This can still encounter out of memory errors depending on audio length - if cuda_free_memory <= 10000000000: + # XXX: we don't really need this anymore, but leaving it in to not affect short audio on small cards + if cuda_free_memory <= settings.chunking_memory_threshold: logger.warning(f'CUDA: Device {cuda_dev_num} has low memory, disabling chunking support') support_chunking = False @@ -483,10 +493,10 @@ def do_chatbot(text, max_new_tokens=chatbot_max_new_tokens, temperature=chatbot_ return output -def do_translate(whisper_model, features, batch_size, language, beam_size): +def do_translate(whisper_model, features, total_chunk_count, language, beam_size): # Set task in token format for processor task = 'translate' - logger.debug(f'WHISPER: Doing translation with {language} beam size {beam_size} and batch size {batch_size}') + logger.debug(f'WHISPER: Doing translation with {language} beam size {beam_size} and total chunk count {total_chunk_count}') processor_task = f'<|{task}|>' # Describe the task in the prompt. @@ -502,7 +512,7 @@ def do_translate(whisper_model, features, batch_size, language, beam_size): # Run generation for the 30-second window. 
time_start = datetime.datetime.now() - results = whisper_model.generate(features, [prompt]*batch_size, beam_size=beam_size) + results = whisper_model.generate(features, [prompt]*total_chunk_count, beam_size=beam_size) time_end = datetime.datetime.now() infer_time = time_end - time_start infer_time_milliseconds = infer_time.total_seconds() * 1000 @@ -563,14 +573,13 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " chunks.append(log_mel_spectrogram(chunk).numpy()) strides.append(stride) mel_features = np.stack(chunks) - batch_size = len(chunks) + total_chunk_count = len(chunks) else: mel_audio = pad_or_trim(audio) mel_features = log_mel_spectrogram(mel_audio).numpy() # Ref Whisper returns shape (80, 3000) but model expects (1, 80, 3000) mel_features = np.expand_dims(mel_features, axis=0) - batch_size = 1 - features = ctranslate2.StorageView.from_array(mel_features) + total_chunk_count = 1 time_end = datetime.datetime.now() infer_time = time_end - time_start @@ -585,7 +594,12 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " processor_language = f'<|{language}|>' if detect_language and not force_language: - results = whisper_model.detect_language(features) + # load the first mel_features batch into the GPU + # just for language detection + # important - this is named gpu_features so it will be unloaded during our batch processing later + first_mel_features = mel_features[0:1, :, :] + gpu_features = ctranslate2.StorageView.from_array(first_mel_features) + results = whisper_model.detect_language(gpu_features) language, probability = results[0][0] processor_language = language logger.debug(f"WHISPER: Detected language {language} with probability {probability}") @@ -617,7 +631,21 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " # Whisper STEP 3 - run model time_start = datetime.datetime.now() logger.debug(f'WHISPER: Using model {model} with beam size {beam_size}') - results = whisper_model.generate(features, [prompt]*batch_size, beam_size=beam_size, return_scores=False) + + results = [] + for i, mel_features_batch in enumerate( + chunkit(mel_features, concurrent_gpu_chunks) + ): + logger.debug("Processing GPU batch %s of expected %s", i+1, len(mel_features) // concurrent_gpu_chunks + 1) + gpu_features = ctranslate2.StorageView.from_array(mel_features_batch) + results.extend( whisper_model.generate( + gpu_features, + [prompt]*len(mel_features_batch), + beam_size=beam_size, + return_scores=False, + )) + assert len(results) == total_chunk_count, "Result length doesn't match expected total_chunk_count" + time_end = datetime.datetime.now() infer_time = time_end - time_start infer_time_milliseconds = infer_time.total_seconds() * 1000 @@ -626,7 +654,7 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " time_start = datetime.datetime.now() if use_chunking: assert strides, 'strides needed to compute final tokens when chunking' - tokens = [(results[i].sequences_ids[0], strides[i]) for i in range(batch_size)] + tokens = [(results[i].sequences_ids[0], strides[i]) for i in range(total_chunk_count)] tokens = find_longest_common_sequence(tokens, models.whisper_processor.tokenizer) else: tokens = results[0].sequences_ids[0] @@ -641,9 +669,14 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " pattern = re.compile("[A-Za-z0-9]+", ) language = pattern.findall(language)[0] - if translate: + # the gpu_features were loaded above when we ran the initial 
whisper model
+    # so we don't need to reload them to the GPU here
+    if translate and total_chunk_count > concurrent_gpu_chunks:
+        logger.warning("Cannot translate: audio is too long to fit on the GPU in a single batch")
+        translation = None
+    elif translate:
         logger.debug(f'WHISPER: Detected non-preferred language {language}, translating')
-        translation = do_translate(whisper_model, features, batch_size, language, beam_size=beam_size)
+        translation = do_translate(whisper_model, gpu_features, total_chunk_count, language, beam_size=beam_size)
         # Strip tokens from translation output - brittle but works right now
         translation = translation.split('>')[2]
         translation = translation.strip()
@@ -1175,7 +1208,6 @@ async def willow(request: Request, response: Response, model: Optional[str] = wh
         channel = "1"
         codec = "pcm"

-    body = b''
     sample_rate = request.headers.get('x-audio-sample-rate', '').lower()
     bits = request.headers.get('x-audio-bits', '').lower()
     channel = request.headers.get('x-audio-channel', '').lower()
@@ -1188,8 +1220,10 @@ async def willow(request: Request, response: Response, model: Optional[str] = wh
     if willow_id:
         logger.debug(f"WILLOW: Got Willow ID {willow_id}")

+    body = []
     async for chunk in request.stream():
-        body += chunk
+        body.append(chunk)
+    body = b''.join(body)

     try:
         if codec == "pcm":
diff --git a/settings.py b/settings.py
index 210de4e..d4d63c3 100644
--- a/settings.py
+++ b/settings.py
@@ -32,6 +32,16 @@ class APISettings(BaseSettings):
     # Enable chunking support
     support_chunking: bool = True

+    # There is really no reason to disable chunking anymore
+    # But if you still want to, you can set this threshold higher
+    # current value is equivalent of 4GB GPUs
+    chunking_memory_threshold: int = 3798205849
+
+    # Maximum number of chunks that are loaded into the GPU at once
+    # This will need to be tweaked based on GPU ram
+    # 8GB GPUs should support at least 2 chunks so starting with that
+    concurrent_gpu_chunks: int = 2
+
     # Enable TTS
     support_tts: bool = True

From 4c52bafe03bc5c4ec79b41628097af03d99fd37b Mon Sep 17 00:00:00 2001
From: Eric Swanson
Date: Tue, 24 Oct 2023 11:29:44 -0400
Subject: [PATCH 2/3] Flake8 changes

---
 main.py     | 12 ++++++------
 settings.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 7283965..6ee2a20 100644
--- a/main.py
+++ b/main.py
@@ -90,10 +90,10 @@


 # you can chunkit
-def chunkit(l, n):
-    """Yield successive n-sized chunks from list l."""
-    for i in range(0, len(l), n):
-        yield l[i:i + n]
+def chunkit(lst, num):
+    """Yield successive num-sized chunks from list lst."""
+    for i in range(0, len(lst), num):
+        yield lst[i:i + num]


 # Function to create a wav file from stream data
@@ -496,7 +496,7 @@ def do_chatbot(text, max_new_tokens=chatbot_max_new_tokens, temperature=chatbot_
 def do_translate(whisper_model, features, total_chunk_count, language, beam_size):
     # Set task in token format for processor
     task = 'translate'
-    logger.debug(f'WHISPER: Doing translation with {language} beam size {beam_size} and total chunk count {total_chunk_count}')
+    logger.debug(f'WHISPER: Doing translation with {language}, beam size {beam_size}, chunk count {total_chunk_count}')
     processor_task = f'<|{task}|>'

     # Describe the task in the prompt.
@@ -638,7 +638,7 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " ): logger.debug("Processing GPU batch %s of expected %s", i+1, len(mel_features) // concurrent_gpu_chunks + 1) gpu_features = ctranslate2.StorageView.from_array(mel_features_batch) - results.extend( whisper_model.generate( + results.extend(whisper_model.generate( gpu_features, [prompt]*len(mel_features_batch), beam_size=beam_size, diff --git a/settings.py b/settings.py index d4d63c3..b5c7972 100644 --- a/settings.py +++ b/settings.py @@ -5,7 +5,7 @@ class APISettings(BaseSettings): # Project metadata - name: str= "Willow Inference Server" + name: str = "Willow Inference Server" description: str = "High Performance Language Inference API" version: str = "1.0" From 50fa0a78acfcd2e2186603b330ac0c489900530f Mon Sep 17 00:00:00 2001 From: Kristian Kielhofner Date: Tue, 24 Oct 2023 10:55:05 -0500 Subject: [PATCH 3/3] Minor tweaks for GPU batching support --- nginx/nginx.conf | 7 +++++-- settings.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/nginx/nginx.conf b/nginx/nginx.conf index b36ac90..afc1aae 100644 --- a/nginx/nginx.conf +++ b/nginx/nginx.conf @@ -36,6 +36,8 @@ http { # Websocket support proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; + # Support very long sessions for GPU batching of large files + proxy_read_timeout 1800; # Use HTTP 1.1 keepalives to backend gunicorn upstream keepalive-wis { @@ -44,8 +46,9 @@ http { keepalive_timeout 3600s; } - # Increase max client body size for ASR file uploads, etc. 100MB matches Cloudflare - client_max_body_size 100M; + # Increase max client body size for ASR file uploads, etc. + # Default to very large to support GPU batching of long audio files. + client_max_body_size 2G; server { listen 19001; diff --git a/settings.py b/settings.py index b5c7972..5360c3a 100644 --- a/settings.py +++ b/settings.py @@ -38,7 +38,7 @@ class APISettings(BaseSettings): chunking_memory_threshold: int = 3798205849 # Maximum number of chunks that are loaded into the GPU at once - # This will need to be tweaked based on GPU ram + # This will need to be tweaked based on GPU ram and model used. # 8GB GPUs should support at least 2 chunks so starting with that concurrent_gpu_chunks: int = 2
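
The batching pattern introduced in PATCH 1/3 can be illustrated in isolation. Below is a minimal, NumPy-only sketch: decode_on_gpu is a made-up stand-in for the ctranslate2.StorageView.from_array load plus whisper_model.generate call, and the window count and shapes are purely illustrative.

import numpy as np


def chunkit(lst, num):
    """Yield successive num-sized chunks from lst."""
    for i in range(0, len(lst), num):
        yield lst[i:i + num]


def decode_on_gpu(batch):
    # Stand-in for ctranslate2.StorageView.from_array(batch) followed by
    # whisper_model.generate(...); returns one dummy result per mel window.
    return ["decoded window of shape %s" % (window.shape,) for window in batch]


# Seven 30-second mel windows of shape (80, 3000), as stacked in do_whisper().
mel_features = np.zeros((7, 80, 3000), dtype=np.float32)
total_chunk_count = len(mel_features)
concurrent_gpu_chunks = 2  # mirrors the new settings.py default

results = []
for i, batch in enumerate(chunkit(mel_features, concurrent_gpu_chunks)):
    # At most concurrent_gpu_chunks windows are handed to the "GPU" at a time.
    print("batch %d: %d window(s)" % (i + 1, len(batch)))
    results.extend(decode_on_gpu(batch))

assert len(results) == total_chunk_count

With seven windows and concurrent_gpu_chunks = 2 this runs four batches of sizes 2, 2, 2 and 1, which is why the final assert in do_whisper() checks that the collected results match total_chunk_count.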