stanford-crfm · andyzorigin · Aug 29, 2023 · Aug 29, 2023 · Aug 29, 2023 · Aug 29, 2023
diff --git a/scripts/data_overlap/compute_data_overlap_metrics.py b/scripts/data_overlap/compute_data_overlap_metrics.py
@@ -27,6 +27,7 @@
 
 PART_INPUT: str = "input"
 PART_REF: str = "references"
+PART_INTERSECT: str = "intersect"
 
 
 # type alias for overlap-related data structures
@@ -106,6 +107,24 @@ def create_ngram_index(
                         ngram_index[n][reference_ngram].add(
                             EntryDataOverlapKey(stats_key=stats_key, instance_id=id, part=PART_REF)
                         )
+
+                # concatenate the last n-1 tokens of input and the first n-1 tokens
+                # of reference and compute n-grams on this "intersection token sequence"
+                # for instance, with n = 5: input = ["is 2+2 4 true or false"], reference = ["true"]
+                # the intersection is the 5-gram ["4 true or false true"]
+                # (which is formed from the input 4-gram [4 true or false] and the reference 1-gram [true])
+                input_end_tokens = input_tokens[-(n - 1) :]
+                for reference in instance.references:
+                    reference_unigrams = tokenizer.tokenize(reference)
+                    reference_start_tokens = reference_unigrams[: n - 1]
+                    intersection_tokens = input_end_tokens + reference_start_tokens
+                    for intersection_ngram in ngrams(intersection_tokens, n):
+                        if intersection_ngram not in ngram_index[n]:
+                            ngram_index[n][intersection_ngram] = set()
+                        ngram_index[n][intersection_ngram].add(
+                            EntryDataOverlapKey(stats_key=stats_key, instance_id=id, part=PART_INTERSECT)
+                        )
+
     return ngram_index
 
 
@@ -116,6 +135,7 @@ def compute_all_data_overlap(
     tokenizer: LightTokenizer,
     stats_key_to_input_ids: DefaultDict[DataOverlapStatsKey, Set[str]],
     stats_key_to_reference_ids: DefaultDict[DataOverlapStatsKey, Set[str]],
+    stats_key_to_intersection_ids: DefaultDict[DataOverlapStatsKey, Set[str]],
     entry_overlap_key_to_ngram_counts: DefaultDict[EntryDataOverlapKey, DefaultDict[str, int]],
     output_ngrams: bool,
 ) -> None:
@@ -140,6 +160,7 @@ def compute_all_data_overlap(
             tokenizer=tokenizer,
             stats_key_to_input_ids=stats_key_to_input_ids,
             stats_key_to_reference_ids=stats_key_to_reference_ids,
+            stats_key_to_intersection_ids=stats_key_to_intersection_ids,
             entry_overlap_key_to_ngram_counts=entry_overlap_key_to_ngram_counts,
             output_ngrams=output_ngrams,
         )
@@ -151,6 +172,7 @@ def compute_document_data_overlap(
     tokenizer: LightTokenizer,
     stats_key_to_input_ids: DefaultDict[DataOverlapStatsKey, Set[str]],
     stats_key_to_reference_ids: DefaultDict[DataOverlapStatsKey, Set[str]],
+    stats_key_to_intersection_ids: DefaultDict[DataOverlapStatsKey, Set[str]],
     entry_overlap_key_to_ngram_counts: DefaultDict[EntryDataOverlapKey, DefaultDict[str, int]],
     output_ngrams: bool,
 ) -> None:
@@ -182,6 +204,8 @@ def compute_document_data_overlap(
                         stats_key_to_input_ids[entry_overlap_key.stats_key].add(id)
                     elif part == PART_REF:
                         stats_key_to_reference_ids[entry_overlap_key.stats_key].add(id)
+                    elif part == PART_INTERSECT:
+                        stats_key_to_intersection_ids[entry_overlap_key.stats_key].add(id)
                     if output_ngrams:
                         entry_overlap_key_to_ngram_counts[entry_overlap_key][document_ngram] += 1
 
@@ -214,6 +238,7 @@ def compute_document_data_overlap(
     # DataOverlapStatsKey -> Set[str] for ids
     stats_key_to_input_ids: DefaultDict[DataOverlapStatsKey, Set] = defaultdict(set)
     stats_key_to_reference_ids: DefaultDict[DataOverlapStatsKey, Set] = defaultdict(set)
+    stats_key_to_intersection_ids: DefaultDict[DataOverlapStatsKey, Set] = defaultdict(set)
 
     entry_overlap_key_to_ngram_counts: DefaultDict[EntryDataOverlapKey, DefaultDict[str, int]] = defaultdict(
         lambda: defaultdict(int)
@@ -232,6 +257,7 @@ def compute_document_data_overlap(
                 tokenizer=tokenizer,
                 stats_key_to_input_ids=stats_key_to_input_ids,
                 stats_key_to_reference_ids=stats_key_to_reference_ids,
+                stats_key_to_intersection_ids=stats_key_to_intersection_ids,
                 entry_overlap_key_to_ngram_counts=entry_overlap_key_to_ngram_counts,
                 output_ngrams=not args.no_output_ngrams,
             )
@@ -255,6 +281,7 @@ def compute_document_data_overlap(
             data_overlap_stats_key=stats_key,
             instance_ids_with_overlapping_input=sorted(stats_key_to_input_ids[stats_key]),
             instance_ids_with_overlapping_reference=sorted(stats_key_to_reference_ids[stats_key]),
+            instance_ids_with_overlapping_intersection=sorted(stats_key_to_intersection_ids[stats_key]),
             num_instances=count,
         )
         all_data_overlap_stats.append(data_overlap_stats)

diff --git a/scripts/data_overlap/data_overlap_spec.py b/scripts/data_overlap/data_overlap_spec.py
@@ -66,6 +66,8 @@ class DataOverlapStats:
 
     instance_ids_with_overlapping_reference: List[str]
 
+    instance_ids_with_overlapping_intersection: List[str]
+
 
 @dataclass(frozen=True)
 class EntryDataOverlapKey: