diff --git a/scripts/data_overlap/README.md b/scripts/data_overlap/README.md index c573c29b44..ad4b542765 100644 --- a/scripts/data_overlap/README.md +++ b/scripts/data_overlap/README.md @@ -32,6 +32,7 @@ For instance, you can call this with The Pile, e.g. have: output_stats = arbitrary output file name, e.g. "output_stats" input_format = the_pile +If you don't want to output the ngrams that are overlapping in the test set to a separate "{output_stats}_ngrams" file, you can pass --no-output-ngrams. There are additional optional args: --normalization default diff --git a/scripts/data_overlap/common/arguments.py b/scripts/data_overlap/common/arguments.py index 09e5f4106f..ae09cdd6e8 100644 --- a/scripts/data_overlap/common/arguments.py +++ b/scripts/data_overlap/common/arguments.py @@ -13,7 +13,7 @@ def get_data_overlap_args() -> Any: required=True, help="The format of your input file for your training data, e.g. raw, custom, the_pile", ) - parser.add_argument("--output-ngrams", type=bool, default=False, help="Whether to output ngrams") + parser.add_argument("--no-output-ngrams", action="store_true", default=False, help="Pass to not output ngrams") parser.add_argument( "--tags", type=str, diff --git a/scripts/data_overlap/compute_data_overlap_metrics.py b/scripts/data_overlap/compute_data_overlap_metrics.py index 3790adc693..05c496992d 100644 --- a/scripts/data_overlap/compute_data_overlap_metrics.py +++ b/scripts/data_overlap/compute_data_overlap_metrics.py @@ -233,10 +233,10 @@ def compute_document_data_overlap( stats_key_to_input_ids=stats_key_to_input_ids, stats_key_to_reference_ids=stats_key_to_reference_ids, entry_overlap_key_to_ngram_counts=entry_overlap_key_to_ngram_counts, - output_ngrams=args.output_ngrams, + output_ngrams=not args.no_output_ngrams, ) - if args.output_ngrams: + if not args.no_output_ngrams: all_entry_overlap_ngrams = [] with open(f"{args.output_stats}_ngrams", "w") as f: for entry_overlap_key in entry_overlap_key_to_ngram_counts: