From b5148ec259ac6f7c123708dec0cc315546658b9a Mon Sep 17 00:00:00 2001 From: Eric Swanson Date: Thu, 19 Oct 2023 21:32:07 -0400 Subject: [PATCH 1/3] Load features onto the GPU in batches to support arbitrarily long audio --- main.py | 64 ++++++++++++++++++++++++++++++++++++++++------------- settings.py | 10 +++++++++ 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/main.py b/main.py index b7f05bb..7283965 100644 --- a/main.py +++ b/main.py @@ -89,6 +89,13 @@ torchaudio.set_audio_backend('soundfile') +# you can chunkit +def chunkit(l, n): + """Yield successive n-sized chunks from list l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + + # Function to create a wav file from stream data def write_stream_wav(data, rate, bits, ch): file = io.BytesIO() @@ -234,6 +241,8 @@ async def create_datagram_endpoint(self, protocol_factory, local_addr: Tuple[str # Chatbot model max length chatbot_max_new_tokens = settings.chatbot_max_new_tokens +concurrent_gpu_chunks = settings.concurrent_gpu_chunks + # Try CUDA device = "cuda" if torch.cuda.is_available() else "cpu" @@ -260,9 +269,10 @@ async def create_datagram_endpoint(self, protocol_factory, local_addr: Tuple[str logger.info(f'CUDA: Device {cuda_dev_num} total memory: {cuda_total_memory} bytes') logger.info(f'CUDA: Device {cuda_dev_num} free memory: {cuda_free_memory} bytes') - # Disable chunking if card has less than 10GB VRAM (complete guess) + # Disable chunking if card has too little VRAM # This can still encounter out of memory errors depending on audio length - if cuda_free_memory <= 10000000000: + # XXX: we don't really need this anymore, but leaving it in to not affect short audio on small cards + if cuda_free_memory <= settings.chunking_memory_threshold: logger.warning(f'CUDA: Device {cuda_dev_num} has low memory, disabling chunking support') support_chunking = False @@ -483,10 +493,10 @@ def do_chatbot(text, max_new_tokens=chatbot_max_new_tokens, temperature=chatbot_ return output -def do_translate(whisper_model, features, batch_size, language, beam_size): +def do_translate(whisper_model, features, total_chunk_count, language, beam_size): # Set task in token format for processor task = 'translate' - logger.debug(f'WHISPER: Doing translation with {language} beam size {beam_size} and batch size {batch_size}') + logger.debug(f'WHISPER: Doing translation with {language} beam size {beam_size} and total chunk count {total_chunk_count}') processor_task = f'<|{task}|>' # Describe the task in the prompt. @@ -502,7 +512,7 @@ def do_translate(whisper_model, features, batch_size, language, beam_size): # Run generation for the 30-second window. 
time_start = datetime.datetime.now() - results = whisper_model.generate(features, [prompt]*batch_size, beam_size=beam_size) + results = whisper_model.generate(features, [prompt]*total_chunk_count, beam_size=beam_size) time_end = datetime.datetime.now() infer_time = time_end - time_start infer_time_milliseconds = infer_time.total_seconds() * 1000 @@ -563,14 +573,13 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " chunks.append(log_mel_spectrogram(chunk).numpy()) strides.append(stride) mel_features = np.stack(chunks) - batch_size = len(chunks) + total_chunk_count = len(chunks) else: mel_audio = pad_or_trim(audio) mel_features = log_mel_spectrogram(mel_audio).numpy() # Ref Whisper returns shape (80, 3000) but model expects (1, 80, 3000) mel_features = np.expand_dims(mel_features, axis=0) - batch_size = 1 - features = ctranslate2.StorageView.from_array(mel_features) + total_chunk_count = 1 time_end = datetime.datetime.now() infer_time = time_end - time_start @@ -585,7 +594,12 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " processor_language = f'<|{language}|>' if detect_language and not force_language: - results = whisper_model.detect_language(features) + # load the first mel_features batch into the GPU + # just for language detection + # important - this is named gpu_features so it will be unloaded during our batch processing later + first_mel_features = mel_features[0:1, :, :] + gpu_features = ctranslate2.StorageView.from_array(first_mel_features) + results = whisper_model.detect_language(gpu_features) language, probability = results[0][0] processor_language = language logger.debug(f"WHISPER: Detected language {language} with probability {probability}") @@ -617,7 +631,21 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " # Whisper STEP 3 - run model time_start = datetime.datetime.now() logger.debug(f'WHISPER: Using model {model} with beam size {beam_size}') - results = whisper_model.generate(features, [prompt]*batch_size, beam_size=beam_size, return_scores=False) + + results = [] + for i, mel_features_batch in enumerate( + chunkit(mel_features, concurrent_gpu_chunks) + ): + logger.debug("Processing GPU batch %s of expected %s", i+1, len(mel_features) // concurrent_gpu_chunks + 1) + gpu_features = ctranslate2.StorageView.from_array(mel_features_batch) + results.extend( whisper_model.generate( + gpu_features, + [prompt]*len(mel_features_batch), + beam_size=beam_size, + return_scores=False, + )) + assert len(results) == total_chunk_count, "Result length doesn't match expected total_chunk_count" + time_end = datetime.datetime.now() infer_time = time_end - time_start infer_time_milliseconds = infer_time.total_seconds() * 1000 @@ -626,7 +654,7 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " time_start = datetime.datetime.now() if use_chunking: assert strides, 'strides needed to compute final tokens when chunking' - tokens = [(results[i].sequences_ids[0], strides[i]) for i in range(batch_size)] + tokens = [(results[i].sequences_ids[0], strides[i]) for i in range(total_chunk_count)] tokens = find_longest_common_sequence(tokens, models.whisper_processor.tokenizer) else: tokens = results[0].sequences_ids[0] @@ -641,9 +669,14 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " pattern = re.compile("[A-Za-z0-9]+", ) language = pattern.findall(language)[0] - if translate: + # the gpu_features were loaded above when we ran the initial 
whisper model
+    # so we don't need to reload them to the GPU here
+    if translate and total_chunk_count > concurrent_gpu_chunks:
+        logger.warning("Cannot translate: audio is too long to fit on the GPU in a single batch")
+        translation = None
+    elif translate:
         logger.debug(f'WHISPER: Detected non-preferred language {language}, translating')
-        translation = do_translate(whisper_model, features, batch_size, language, beam_size=beam_size)
+        translation = do_translate(whisper_model, gpu_features, total_chunk_count, language, beam_size=beam_size)
         # Strip tokens from translation output - brittle but works right now
         translation = translation.split('>')[2]
         translation = translation.strip()
@@ -1175,7 +1208,6 @@ async def willow(request: Request, response: Response, model: Optional[str] = wh
         channel = "1"
         codec = "pcm"

-    body = b''
     sample_rate = request.headers.get('x-audio-sample-rate', '').lower()
     bits = request.headers.get('x-audio-bits', '').lower()
     channel = request.headers.get('x-audio-channel', '').lower()
@@ -1188,8 +1220,10 @@ async def willow(request: Request, response: Response, model: Optional[str] = wh
     if willow_id:
         logger.debug(f"WILLOW: Got Willow ID {willow_id}")

+    body = []
     async for chunk in request.stream():
-        body += chunk
+        body.append(chunk)
+    body = b''.join(body)

     try:
         if codec == "pcm":
diff --git a/settings.py b/settings.py
index 210de4e..d4d63c3 100644
--- a/settings.py
+++ b/settings.py
@@ -32,6 +32,16 @@ class APISettings(BaseSettings):
     # Enable chunking support
     support_chunking: bool = True

+    # There is really no reason to disable chunking anymore
+    # But if you still want to, you can set this threshold higher
+    # current value is equivalent of 4GB GPUs
+    chunking_memory_threshold: int = 3798205849
+
+    # Maximum number of chunks that are loaded into the GPU at once
+    # This will need to be tweaked based on GPU ram
+    # 8GB GPUs should support at least 2 chunks so starting with that
+    concurrent_gpu_chunks: int = 2
+
     # Enable TTS
     support_tts: bool = True

From 4c52bafe03bc5c4ec79b41628097af03d99fd37b Mon Sep 17 00:00:00 2001
From: Eric Swanson
Date: Tue, 24 Oct 2023 11:29:44 -0400
Subject: [PATCH 2/3] Flake8 changes

---
 main.py     | 12 ++++++------
 settings.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 7283965..6ee2a20 100644
--- a/main.py
+++ b/main.py
@@ -90,10 +90,10 @@


 # you can chunkit
-def chunkit(l, n):
-    """Yield successive n-sized chunks from list l."""
-    for i in range(0, len(l), n):
-        yield l[i:i + n]
+def chunkit(lst, num):
+    """Yield successive num-sized chunks from list lst."""
+    for i in range(0, len(lst), num):
+        yield lst[i:i + num]


 # Function to create a wav file from stream data
@@ -496,7 +496,7 @@ def do_chatbot(text, max_new_tokens=chatbot_max_new_tokens, temperature=chatbot_
 def do_translate(whisper_model, features, total_chunk_count, language, beam_size):
     # Set task in token format for processor
     task = 'translate'
-    logger.debug(f'WHISPER: Doing translation with {language} beam size {beam_size} and total chunk count {total_chunk_count}')
+    logger.debug(f'WHISPER: Doing translation with {language}, beam size {beam_size}, chunk count {total_chunk_count}')
     processor_task = f'<|{task}|>'

     # Describe the task in the prompt.
@@ -638,7 +638,7 @@ def do_whisper(audio_file, model: str, beam_size: int = beam_size, task: str = " ): logger.debug("Processing GPU batch %s of expected %s", i+1, len(mel_features) // concurrent_gpu_chunks + 1) gpu_features = ctranslate2.StorageView.from_array(mel_features_batch) - results.extend( whisper_model.generate( + results.extend(whisper_model.generate( gpu_features, [prompt]*len(mel_features_batch), beam_size=beam_size, diff --git a/settings.py b/settings.py index d4d63c3..b5c7972 100644 --- a/settings.py +++ b/settings.py @@ -5,7 +5,7 @@ class APISettings(BaseSettings): # Project metadata - name: str= "Willow Inference Server" + name: str = "Willow Inference Server" description: str = "High Performance Language Inference API" version: str = "1.0" From 50fa0a78acfcd2e2186603b330ac0c489900530f Mon Sep 17 00:00:00 2001 From: Kristian Kielhofner Date: Tue, 24 Oct 2023 10:55:05 -0500 Subject: [PATCH 3/3] Minor tweaks for GPU batching support --- nginx/nginx.conf | 7 +++++-- settings.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/nginx/nginx.conf b/nginx/nginx.conf index b36ac90..afc1aae 100644 --- a/nginx/nginx.conf +++ b/nginx/nginx.conf @@ -36,6 +36,8 @@ http { # Websocket support proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; + # Support very long sessions for GPU batching of large files + proxy_read_timeout 1800; # Use HTTP 1.1 keepalives to backend gunicorn upstream keepalive-wis { @@ -44,8 +46,9 @@ http { keepalive_timeout 3600s; } - # Increase max client body size for ASR file uploads, etc. 100MB matches Cloudflare - client_max_body_size 100M; + # Increase max client body size for ASR file uploads, etc. + # Default to very large to support GPU batching of long audio files. + client_max_body_size 2G; server { listen 19001; diff --git a/settings.py b/settings.py index b5c7972..5360c3a 100644 --- a/settings.py +++ b/settings.py @@ -38,7 +38,7 @@ class APISettings(BaseSettings): chunking_memory_threshold: int = 3798205849 # Maximum number of chunks that are loaded into the GPU at once - # This will need to be tweaked based on GPU ram + # This will need to be tweaked based on GPU ram and model used. # 8GB GPUs should support at least 2 chunks so starting with that concurrent_gpu_chunks: int = 2
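
The batching pattern introduced in PATCH 1/3 can be illustrated in isolation. Below is a minimal, NumPy-only sketch: decode_on_gpu is a made-up stand-in for the ctranslate2.StorageView.from_array load plus whisper_model.generate call, and the window count and shapes are purely illustrative.

import numpy as np


def chunkit(lst, num):
    """Yield successive num-sized chunks from lst."""
    for i in range(0, len(lst), num):
        yield lst[i:i + num]


def decode_on_gpu(batch):
    # Stand-in for ctranslate2.StorageView.from_array(batch) followed by
    # whisper_model.generate(...); returns one dummy result per mel window.
    return ["decoded window of shape %s" % (window.shape,) for window in batch]


# Seven 30-second mel windows of shape (80, 3000), as stacked in do_whisper().
mel_features = np.zeros((7, 80, 3000), dtype=np.float32)
total_chunk_count = len(mel_features)
concurrent_gpu_chunks = 2  # mirrors the new settings.py default

results = []
for i, batch in enumerate(chunkit(mel_features, concurrent_gpu_chunks)):
    # At most concurrent_gpu_chunks windows are handed to the "GPU" at a time.
    print("batch %d: %d window(s)" % (i + 1, len(batch)))
    results.extend(decode_on_gpu(batch))

assert len(results) == total_chunk_count

With seven windows and concurrent_gpu_chunks = 2 this runs four batches of sizes 2, 2, 2 and 1, which is why the final assert in do_whisper() checks that the collected results match total_chunk_count.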