-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_buckets.py
executable file
·64 lines (54 loc) · 2.43 KB
/
merge_buckets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
# coding=utf-8
#
# Copyright 2022 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Merges buckets with ranked lemmas classified by rank_unprocessed_lemmas_in_corpus.py.
Inputs:
--buckets: list of bucket files classified by
rank_unprocessed_lemmas_in_corpus.py, separated by a comma.
--examples: Optional. If CSV with examples is given with synsemclass_id and
synsemclass fields, the script additionally adds the class names.
--output: Output TXT file.
"""
import csv
import sys
if __name__ == "__main__":
import argparse
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--buckets",
default="bucket_2753494_1_freq_to_16.txt,bucket_2753494_2_freq_to_49.txt,bucket_2753494_3_freq_to_122.txt,bucket_2753494_4_freq_to_312.txt,bucket_2753494_5_freq_to_999999.txt",
type=str,
help="Bucket files.")
parser.add_argument("--examples",
default=None,
type=str,
help="Examples CSV file to extract class names (optional).")
parser.add_argument("--output", default="all_buckets_2753494.txt", type=str, help="All merged buckets.")
args=parser.parse_args()
if args.examples:
classes = dict()
with open(args.examples, "r", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
classes[row["synsemclass_id"]] = row["synsemclass"]
lines = dict()
for bucket in args.buckets.split(","):
with open(bucket, "r", encoding="utf-8") as fr:
for line in fr.readlines():
line = line.rstrip()
cols = line.split("\t")
for i in range(3, len(cols), 2):
if args.examples:
cols[i] = "{} / {}".format(cols[i], classes[cols[i]])
line = "\t".join(cols)
lines[line] = float(cols[2])
with open(args.output, "w", encoding="utf-8") as fw:
for line, _ in sorted(lines.items(), key=lambda x: x[1]):
print(line, file=fw)