-
Notifications
You must be signed in to change notification settings - Fork 1
/
add_initial_data.py
executable file
·187 lines (137 loc) · 5.83 KB
/
add_initial_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/python
from __future__ import print_function
import sys
import os
import glob
import itertools
import argparse
from flask import Flask
import openpyxl
import requests
from server.app import app
from server.models import *
from server.api import get_categories
# Expected spreadsheet headers
HEADERS = [('A', 'Concept'),
('B', 'Name'),
('C', 'Unit Code'),
('D', 'Taught'),
('E', 'Applied'),
('F', 'Assessed'),
('G', 'Context')]
def to_bool(val):
return (1 if (val == 'y') else 0)
def is_custom(title):
return title.startswith('UOM:')
def is_section(title):
return "#" in title
def normalize_custom_title(title):
""" Applies WP-like normalization to a custom topics title """
return title.replace('_', ' ')
def normalize_wp_title(title):
""" Applies WP normalization to a title, so we get it's canonical form"""
params = {
'action': 'query',
'titles': title,
'format': 'json',
'indexpageids': True
}
r = requests.get('http://en.wikipedia.org/w/api.php', params=params)
responce = r.json()
pageid = responce['query']['pageids'][0]
assert pageid != '-1', 'Title not found'
return responce['query']['pages'][pageid]['title']
def normalize_title(title):
""" Applies correct type of normalization depending on topic type """
if is_custom(title):
return normalize_custom_title(title[4:])
else:
return normalize_wp_title(title)
def process_workbook(workbook):
sheet = openpyxl.load_workbook(workbook).get_active_sheet()
for header in HEADERS:
if sheet.cell('%s1' % header[0]).value != header[1]:
print("Error : Invalid cell in spreadsheet header cell %s" % header[0])
sys.exit(1)
# We couldn't add contexts straight away, as corresponding topics might not
# yet be added. So we parse and save them here, and add after the topics are
# added
topic_contexts = []
for row in range(2,sheet.get_highest_row()+1):
if (sheet.cell('A%d' % row).value):
concept = dict()
for field in HEADERS:
concept[field[1]] = sheet.cell('%s%d' % (field[0], row)).value
# FIXME: Skipping sections for now
if is_section(concept['Concept']):
continue
# Before topic title is normalized - determine if it's custom
is_custom_concept = is_custom(concept['Concept'])
# Name might be just a duplicated identifier - we don't need it then
if concept['Name'] == concept['Concept']:
concept['Name'] = None
concept['Concept'] = normalize_title(concept['Concept'])
# Name might also be a normalized identifier - we don't need it either
if concept['Name'] == concept['Concept']:
concept['Name'] = None
topic, is_topic_new = None, None
if is_custom_concept:
topic = db.session.query(CustomTopic).filter_by(name=concept['Concept']).first()
is_topic_new = not topic
if is_topic_new:
topic = CustomTopic(concept['Concept'])
topic.description = 'Added from spreadsheets'
else:
topic = db.session.query(Topic).filter_by(name=concept['Concept']).first()
is_topic_new = not topic
if is_topic_new:
topic = Topic(concept['Concept'])
topic.categories = get_categories(topic)
if is_topic_new:
db.session.add(topic)
db.session.flush()
unit = db.session.query(Unit).filter_by(code=concept['Unit Code']).one()
unit_topic = UnitTopic(unit.id, topic.id)
unit_topic.alias = concept['Name']
unit_topic.is_taught = to_bool(concept['Taught'])
unit_topic.is_assessed = to_bool(concept['Assessed'])
unit_topic.is_applied = to_bool(concept['Applied'])
db.session.add(unit_topic)
db.session.commit()
if concept['Context']:
contexts = concept['Context'].split()
# FIXME: Remove sections for now
contexts = itertools.ifilterfalse(is_section, contexts)
# Normalise titles
contexts = map(normalize_title, contexts)
topic_contexts.append((unit_topic.id, contexts))
# Some lazy progress reporting
sys.stdout.write('.')
sys.stdout.flush()
for unit_topic_id, contexts in topic_contexts:
unit_topic = db.session.query(UnitTopic).filter_by(id=unit_topic_id).one()
unit_topic.contexts = db.session.query(Topic).filter(Topic.name.in_(contexts)).all()
print('Done')
def insert_units(units_filename):
with open(units_filename) as f:
for unit_line in f:
code, name = map(str.strip, unit_line.split(',', 1))
unit = Unit(code,name)
db.session.add(unit)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Insert initial data from spreadsheets')
parser.add_argument('db_uri', help='Database URI to insert to (e.g. sqlite:///test.db)')
parser.add_argument('data_dir', help='Directory with initial data')
args = parser.parse_args()
app.config['SQLALCHEMY_DATABASE_URI'] = args.db_uri
with app.app_context():
db.drop_all()
db.create_all()
units_filename = os.path.join(args.data_dir, 'units.txt')
insert_units(units_filename)
db.session.commit()
spreadsheets = glob.glob(os.path.join(args.data_dir, '*.xlsx'))
for workbook in spreadsheets:
print('Processing ' + workbook)
process_workbook(workbook)
db.session.commit()