-
Notifications
You must be signed in to change notification settings - Fork 4
/
convert_corpus.py
63 lines (55 loc) · 2.05 KB
/
convert_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import json
from collections import Counter
import argparse
import pathlib
import smart_open
def main(args):
    """Convert a tab-separated parallel corpus into JSONL instruction records.

    Reads at most ``args.first_n`` lines from ``args.input_path``. Each line
    is split on its first tab into an original and a translated segment;
    both are stripped and substituted into ``args.template`` (placeholders
    ``{orig}`` and ``{translated}``). Texts whose length does not exceed
    ``args.filter_by_len`` are written to ``args.output_path`` as one
    ``{"text": ...}`` JSON object per line.

    Afterwards, for each of a fixed set of length thresholds, prints the
    first observed text length at or above the threshold together with the
    cumulative share of recorded texts up to and including that length.

    Lines without a tab separator are skipped (they still count towards
    ``args.first_n``) and their number is reported on stderr.

    Args:
        args: Namespace providing ``input_path``, ``output_path``,
            ``template``, ``first_n`` and ``filter_by_len``.
    """
    import sys  # local import: only needed for the malformed-line warning

    lengths = Counter()
    skipped = 0

    with smart_open.open(args.input_path, "r", encoding="utf-8") as fp_in:
        with smart_open.open(args.output_path, "w", encoding="utf-8") as fp_out:
            for line_no, line in enumerate(fp_in):
                # Check the limit before parsing so the line just past the
                # limit is never split (and can never crash the run).
                if line_no >= args.first_n:
                    break

                orig, sep, trans = line.partition("\t")
                if not sep:
                    # No tab on this line (e.g. a blank line): nothing to pair up.
                    skipped += 1
                    continue

                text = args.template.format(orig=orig.strip(), translated=trans.strip())
                if len(text) <= args.filter_by_len:
                    fp_out.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
                    # NOTE(review): indentation of the original was not
                    # recoverable; lengths are recorded only for written
                    # records here -- confirm this matches the intent.
                    lengths.update([len(text)])

    if skipped:
        print(f"Skipped {skipped} line(s) without a tab separator", file=sys.stderr)

    running_total = 0
    full_total = sum(lengths.values()) or 1
    # Thresholds still waiting to be reported, in ascending order; each one
    # is reported once, at the first observed length that reaches it.
    pending_targets = [256, 512, 768, 1024, 2048]

    for length in sorted(lengths):
        running_total += lengths[length]
        while pending_targets and length >= pending_targets[0]:
            print(length, running_total / full_total)
            pending_targets.pop(0)
if __name__ == "__main__":
    # Command-line interface: two positional paths followed by optional
    # settings for the template, the line limit and the length filter.
    cli = argparse.ArgumentParser()

    cli.add_argument(
        "input_path",
        help="Path to the file with the parallel corpus in tab separated format",
        type=pathlib.Path,
    )
    cli.add_argument(
        "output_path",
        help="Path to the jsonl file with generated instructions for lora finetuning",
        type=pathlib.Path,
    )

    cli.add_argument(
        "--template",
        help="Instruction template",
        default="[INST] {orig} [/INST] {translated}",
    )
    cli.add_argument(
        "--first-n",
        help="Number of sentences to draw from the input file",
        default=1000,
        type=int,
    )
    cli.add_argument(
        "--filter-by-len",
        help="Do not include instructions longer than that",
        default=1000000,
        type=int,
    )

    main(args=cli.parse_args())