-
Notifications
You must be signed in to change notification settings - Fork 1
/
catscan.py
99 lines (89 loc) · 2.26 KB
/
catscan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os, json, argparse, sys, datetime, time, csv, datetime
"""
bzcat latest-all.json.bz2 |grep '"site":"enwiki"' |wikibase-dump-filter --simplify 'keepRichValues=false' |jq '[.id,.sitelinks.enwiki,.claims.P31,.claims.P279]' -c >enwiki.ndjson
"""
def sortdict(d):
    """Placeholder that currently does nothing.

    The argument ``d`` is ignored and ``None`` is returned. No call to this
    function is visible in this script.
    """
    return None
# Script file name with its last three characters (".py") cut off.
# Nothing below reads it; kept so the module-level name stays available.
script = os.path.basename(sys.argv[0])[:-3]


def load_articles(lines):
    """Index dump records by article title.

    Each non-blank line must be a JSON array ``[item_id, title, P31, P279]``.
    Spaces in the title are replaced by underscores before it is used as the
    key. When a title occurs more than once, the first record wins.

    Args:
        lines: iterable of text lines (an open file works and is streamed).

    Returns:
        dict mapping title -> ``(item_id, P31, P279)``.

    Raises:
        ValueError: if a record has a missing or empty title.
    """
    arts = {}
    for lineno, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            # Tolerate stray blank lines instead of failing in json.loads.
            continue
        arr = json.loads(line)
        qit, title, P31, P279 = arr[0], arr[1], arr[2], arr[3]
        # Validate before calling .replace(), so a null title is reported
        # as bad input rather than as an AttributeError.
        if not title:
            raise ValueError(
                'line {}: record {!r} has no article title'.format(lineno, qit))
        arts.setdefault(title.replace(' ', '_'), (qit, P31, P279))
    return arts


def read_titles(rows):
    """Return the set of distinct, non-empty ``title`` values found in rows.

    Args:
        rows: iterable of dict-like rows, e.g. a ``csv.DictReader``.

    Rows whose ``title`` is missing or empty are skipped, so no ``None``
    entry ends up in the result.
    """
    titles = set()
    for row in rows:
        title = row.get('title')
        if title:
            titles.add(title)
    return titles


def as_list(value):
    """Wrap a lone string in a list; return any other value unchanged."""
    return [value] if isinstance(value, str) else value


def classify(cats, arts):
    """Sort the scanned titles into buckets by their P31 / P279 values.

    Args:
        cats: iterable of article titles to look up.
        arts: dict as returned by ``load_articles``.

    Returns:
        A tuple ``(fresh, noboth, noP31, allP31, allP279, onlyP279)``:

        * fresh    -- titles that have no record in ``arts``
        * noboth   -- item ids with neither P31 nor P279
        * noP31    -- ``(item_id, P279 values)`` pairs for items without P31
        * allP31   -- P31 value -> item ids (items that have P31)
        * allP279  -- P279 value -> item ids; items that have P31 but no
          P279 are filed under the key ``None``
        * onlyP279 -- P279 value -> item ids, restricted to items without P31

    Titles are processed in sorted order so the result does not depend on
    set iteration order.
    """
    fresh, noboth, noP31 = [], [], []
    allP31, allP279, onlyP279 = {}, {}, {}
    for art in sorted(cats):
        entry = arts.get(art)
        if entry is None:
            fresh.append(art)
            continue
        qit = entry[0]
        P31 = as_list(entry[1])
        P279 = as_list(entry[2])
        if P31 is None:
            if P279 is None:
                noboth.append(qit)
                continue
            # Append: plain assignment here would replace the list with a
            # single tuple and lose every earlier entry.
            noP31.append((qit, P279))
            for p in P279:
                onlyP279.setdefault(p, []).append(qit)
                allP279.setdefault(p, []).append(qit)
            continue
        for p in (P279 if P279 is not None else [None]):
            allP279.setdefault(p, []).append(qit)
        for p in P31:
            allP31.setdefault(p, []).append(qit)
    return fresh, noboth, noP31, allP31, allP279, onlyP279


def main(argv=None):
    """Command-line entry point.

    Reads the dump file given with ``-a/--articles`` and the titles listed in
    ``catscan.tsv`` (current directory), then prints to stdout: the number of
    distinct titles, one wiki URL line per title missing from the dump, and
    one ``{{Q|...}}`` line per item that has neither P31 nor P279. The other
    buckets are computed but not printed.

    Args:
        argv: argument list for argparse; ``None`` means ``sys.argv[1:]``.
    """
    # Initiate the parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--articles', action='store', required=True)
    # Read arguments from the command line
    args = parser.parse_args(argv)

    print('reading dump data...', file=sys.stderr)
    with open(args.articles, encoding='utf-8') as dump:
        arts = load_articles(dump)

    # newline='' is what the csv module asks for when it is handed a file.
    # NOTE(review): assumes catscan.tsv is UTF-8 like the dump - confirm.
    with open('catscan.tsv', 'r', newline='', encoding='utf-8') as tsv:
        cats = read_titles(csv.DictReader(tsv, delimiter='\t'))
    print(len(cats))

    fresh, noboth, noP31, allP31, allP279, onlyP279 = classify(cats, arts)
    for f in fresh:
        print('* https://de.wikipedia.org/wiki/{}'.format(f))
    for i in noboth:
        print('* {{{{Q|{}}}}}'.format(i))


if __name__ == '__main__':
    main()