Attempt to fix a very occasional hang in datasets map/filter (#725)

* don't use lambdas in datasets map/filter * make tokenizer building safe for distributed runs
mosaicml · Nov 9, 2023 · efaa545 · efaa545
1 parent ab9b938
commit efaa545
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 4 deletions.
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
@@ -362,8 +362,12 @@ def dataset_mapper(example: Dict):
             num_proc=num_cpus_to_use,
             desc='Tokenizing dataset',
         )
+
+        def filter_long_prompts(example: Dict) -> bool:
+            return len(example['input_ids']) < max_seq_len
+
         prompt_length_filtered_dataset = tokenized_dataset.filter(
-            lambda example: len(example['input_ids']) < max_seq_len,
+            filter_long_prompts,
             num_proc=num_cpus_to_use,
             desc='Filtering out long prompts',
         )
@@ -376,10 +380,14 @@ def dataset_mapper(example: Dict):
             )
 
         pad_token_id = tokenizer.pad_token_id
+
+        def filter_empty_examples(example: Dict) -> bool:
+            return len(example['input_ids']) > 0 and len(
+                example['labels']) > 0 and any(
+                    token_id != pad_token_id for token_id in example['labels'])
+
         empty_examples_dropped_dataset = prompt_length_filtered_dataset.filter(
-            lambda example: len(example['input_ids']) > 0 and len(example[
-                'labels']) > 0 and any(token_id != pad_token_id
-                                       for token_id in example['labels']),
+            filter_empty_examples,
             num_proc=num_cpus_to_use,
             desc='Filtering out empty examples')
 

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
@@ -188,6 +188,12 @@ def build_tokenizer(
     os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
     os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
+    signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup'
+
+    # Make sure the tokenizer files are downloaded and cached first by local rank 0
+    with dist.local_rank_zero_download_and_wait(signal_file_path):
+        pass
+
     if tokenizer_name.startswith('tiktoken'):
         tokenizer = TiktokenTokenizerWrapper(**tokenizer_kwargs)
     else:
@@ -202,6 +208,15 @@ def build_tokenizer(
             int(1e30),
         )
 
+    if dist.get_local_rank() == 0:
+        with open(signal_file_path, 'wb') as f:
+            f.write(b'local_rank0_completed_tokenizer_setup')
+
+    dist.barrier()
+
+    if dist.get_local_rank() == 0:
+        os.remove(signal_file_path)
+
     return tokenizer