diff --git a/scripts/data_overlap/load_documents.py b/scripts/data_overlap/load_documents.py index 3e173fc5136..82bc17c186d 100644 --- a/scripts/data_overlap/load_documents.py +++ b/scripts/data_overlap/load_documents.py @@ -115,7 +115,7 @@ def get_the_pile_document_iterator(file_path: str) -> Iterator[str]: """ with open(file_path, "r") as f: import random - if random.random() < 1 / 1000: + if random.random() < 1 / 10: for line in f: yield json.loads(line)["text"]