-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile
428 lines (321 loc) · 13.7 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
# IHCC Demonstration Makefile
# James A. Overton <[email protected]>
#
# WARN: This file contains significant whitespace, i.e. tabs!
# Ensure that your text editor shows you those characters.
### Workflow
# The following workflow defines all tasks necessary to upload,
# preprocess, share, and map a new data dictionary.
#
# 1. [Upload cohort data](./src/workflow.py?action=create)
# 2. [Open Google Sheet](./src/workflow.py?action=open)
# 3. [Run automated mapping for new data dictionary](automated_mapping)
# 4. [Share Google Sheet with submitter](./src/workflow.py?action=share)
# 5. [Prepare data dictionary for build](prepare_build)
# 6. [Run automated validation](automated_validation)
# 7. [Build data dictionary](all)
# 8. [View results](build/)
# 9. [Add data dictionary to Version Control](finalize)
# 10. Prepare git commit (click on `Commit` in `Version Control` menu)
# 11. Push changes to GitHub (click on `Push` in `Version Control` menu), and make pull request.
# 12. [Delete Google sheet (Caution, cannot be undone)](cogs_delete)
#
#### IHCC Data Admin Tasks
# * [Update all data, including data dictionaries](all)
# * [Update only data dictionaries](owl)
# * [Run all mappings (quality control)](all_mapping_suggest)
# * [Clean build directory](clean)
# [](./src/tree.sh)
### Configuration
#
# These are standard options to make Make sane:
# <http://clarkgrubb.com/makefile-style-guide#toc2>
MAKEFLAGS += --warn-undefined-variables
SHELL := bash
.SHELLFLAGS := -eu -o pipefail -c
.DEFAULT_GOAL := all
.DELETE_ON_ERROR:
.SUFFIXES:
.SECONDARY:
ROBOT = java -jar build/robot.jar --prefixes src/prefixes.json
GECKO_PURL = http://purl.obolibrary.org/obo/gecko/views/ihcc-gecko.owl
TODAY ?= $(shell date +%Y-%m-%d)
# Detect the OS and provide proper command
# WARNING - will not work with Windows OS
UNAME = $(shell uname)
ifeq ($(UNAME), Darwin)
SED = sed -i .bak
else
SED = sed -i
endif
### Variables
# List of cohorts to generate files for (lowercase, using short ID where possible)
# This short name is used throughout all build tasks and should be consistent for all files
COHORTS := $(filter-out maelstrom, $(patsubst %.ttl, %, $(notdir $(wildcard metadata/*.ttl))))
TEMPLATES := $(foreach C,$(COHORTS),templates/$(C).tsv)
METADATA := $(foreach C,$(COHORTS),metadata/$(C).ttl)
ZOOMA_DATASET = data/ihcc-mapping-suggestions-zooma.tsv
OLS_CONFIG = data/ols-config.yaml
# --- These files are not in version control (all in build directory) ---
# OWL file in the build directory for all cohorts (contains xrefs)
ONTS := $(foreach C,$(COHORTS),build/$(C).owl)
# HTML tree browser and table for each cohort
DBS := build/gecko.db $(foreach C,$(COHORTS),build/$(C).db)
TABLES := $(foreach C,$(COHORTS),build/$(C).html)
### General Tasks
.PHONY: clean
clean:
rm -rf build
.PHONY: all
all: $(DBS) $(TABLES)
all: build/index.html
all: data/cohort-data.json
all: build/member_cohorts.csv
all: owl
all: $(ZOOMA_DATASET)
all: $(OLS_CONFIG)
.PHONY: update
update: clean all
build/index.html: src/create_index.py src/index.html.jinja2 data/metadata.json | $(DBS) $(TABLES)
python3 $^ $@
.PHONY: finalize
finalize: src/finalize.py build/metadata.tsv
python3 $^
### MAELSTROM Tasks
.PHONY: refresh-maelstrom
refresh-maelstrom:
rm data/maelstrom.yml
make data/maelstrom.yml
data/maelstrom.yml:
curl -Lk https://raw.githubusercontent.com/maelstrom-research/maelstrom-taxonomies/master/AreaOfInformation.yml > $@
.PHONY: update-maelstrom-template
update-maelstrom-template: src/convert/maelstrom.py data/maelstrom.yml templates/maelstrom.tsv
python3 $^
# Maelstrom has different headers than other templates
build/maelstrom.tsv: templates/maelstrom.tsv
sed -E '2s/^/ID AL rdfs:label@en AL definition@en SC % AL label@fr AL definition@fr A internal ID# is-required;#/' $< | tr '#' '\n' > $@
### Cohort OWL Files
# Run `make owl` to generate all cohort OWL files
.PHONY: owl
owl: $(ONTS) | data_dictionaries
cp $^ data_dictionaries/
build/%.tsv: templates/%.tsv
sed -E '2s/^/ID LABEL C % SPLIT=| A definition A internal ID# is-required;#/' $< | tr '#' '\n' > $@
build/%.owl: build/intermediate/properties.owl build/%.tsv build/intermediate/%-xrefs.tsv metadata/%.ttl | build/robot.jar
$(ROBOT) template --input $< \
--merge-before \
--template $(word 2,$^) \
template \
--template $(word 3,$^) \
--merge-before \
annotate --ontology-iri "https://purl.ihccglobal.org/$(notdir $@)" \
--version-iri "https://purl.ihccglobal.org/$(notdir $(basename $@))/releases/$(TODAY)/$(notdir $@)" \
--annotation-file $(word 4,$^) \
--output $@
# Generate a metadata file for the current cohort, move the terminology to templates, & add the prefix
# TODO - this should not be a phony task name,
# ideally we should use the branch name here but all other tasks are using the "metadata" file
.PHONY: prepare_build
prepare_build: src/prepare.py build/metadata.tsv build/terminology.tsv src/prefixes.json data/metadata.json
python3 $^
### GECKO Mapping TSVs
# Intermediate files for GECKO mappings
# GECKO terms as xrefs for main files
# These are required to build the cohort OWL file
# The xrefs are generated from the mapping template
build/intermediate/gecko_index.tsv: build/gecko.owl | build/robot.jar build/intermediate
$(ROBOT) export --input $< \
--header "ID|LABEL" \
--sort "ID" \
--export $@
build/intermediate/%-xrefs.tsv: src/create_xref_template.py build/%.tsv build/intermediate/gecko_index.tsv | build/intermediate
python3 $^ $@
### Trees and Tables
build/prefixes.sql: src/convert_prefixes.py src/prefixes.json | build
python3 $^ $@
build/%.db: build/prefixes.sql build/%.owl | build/rdftab
rm -rf $@
sqlite3 $@ < $<
./build/rdftab $@ < $(word 2,$^)
UC = $(shell echo '$*' | tr '[:lower:]' '[:upper:]')
PREDICATES := CURIE label rdfs:subClassOf definition oboInOwl:hasDbXref IHCC:internal-id
build/%.html: build/%.db
python3 -m gizmos.export \
-d $< \
-f html \
$(foreach P,$(PREDICATES),-p $(P)) \
-V CURIE \
-w "stanza LIKE '$(call UC,$(basename $(notdir $@))):%'" | \
sed 's/rdfs:subClassOf/parent/g' | \
sed 's/oboInOwl:hasDbXref/GECKO category/g' | \
sed 's/IHCC:internal-id/internal ID/g' > $@
### JSON Data for Real Browser
# Top-level cohort data
build/gecko_structure.json: src/get_gecko_structure.py build/gecko.db
python3 $^ > $@
data/cohort-data.json: src/generate_cohort_json.py data/metadata.json build/gecko_structure.json $(TEMPLATES)
python3 $(filter-out $(TEMPLATES),$^) $@
# Real cohort data + randomly-generated cohort data
data/full-cohort-data.json: data/cohort-data.json data/random-data.json
rm -f $@
sed '$$d' $< | sed '$$d' >> $@
echo ' },' >> $@
sed '1d' $(word 2,$^) >> $@
### Admin Metadata
build/member_cohorts.csv: src/convert_metadata.py data/metadata.json
python3 $^ $@
### COGS Set Up & Tasks
# The branch name should be the namespace for the new cohort
BRANCH := $(shell git rev-parse --abbrev-ref HEAD)
init-cogs: .cogs
templates/$(BRANCH).tsv:
echo -e "Term ID\tLabel\tParent Term\tDefinition\tGECKO Category\tSuggested Categories\tComment\n" > $@
# required env var GOOGLE_CREDENTIALS
.cogs: | templates/$(BRANCH).tsv
cogs init -u $(EMAIL) -t $(BRANCH)
cogs add templates/$(BRANCH).tsv -r 2
cogs push
cogs open
cogs-apply-%: build/cogs-%.tsv
cogs apply $<
destroy-cogs: | .cogs
cogs delete -f
### Pre-build Tasks
build build/intermediate build/browser build/browser/cohorts data_dictionaries:
mkdir -p $@
build/robot.jar: | build
curl -Lk -o $@ https://build.obolibrary.io/job/ontodev/job/robot/job/master/lastSuccessfulBuild/artifact/bin/robot.jar
UNAME := $(shell uname)
ifeq ($(UNAME), Darwin)
RDFTAB_URL := https://github.com/ontodev/rdftab.rs/releases/download/v0.1.1/rdftab-x86_64-apple-darwin
else
RDFTAB_URL := https://github.com/ontodev/rdftab.rs/releases/download/v0.1.1/rdftab-x86_64-unknown-linux-musl
endif
build/rdftab: | build
curl -L -o $@ $(RDFTAB_URL)
chmod +x $@
build/intermediate/properties.owl: templates/properties.tsv | build/intermediate build/robot.jar
$(ROBOT) template --template $< --output $@
build/gecko.owl: | build
curl -L -o $@ $(GECKO_PURL)
# ------------------------------------------------------------------------------------------------
# Cohort data dictionary in JSON format
build/browser/%-data.json: build/%.owl | build/browser build/robot.jar
$(ROBOT) export \
--input $< \
--header "ID|LABEL|definition|subclasses" \
--sort "LABEL" \
--export $@
# Full cohort data (copied)
build/browser/cohorts.json: data/full-cohort-data.json | build/browser
cp $< $@
# Cohort metadata (copied)
build/browser/metadata.json: data/metadata.json | build/browser
cp $< $@
# This is used to drive aggregations
build/browser/cineca.json: build/gecko.owl | build/browser
$(ROBOT) export \
--input $< \
--header "ID|LABEL|subclasses" \
--sort "LABEL" \
--export $@
build/browser/index.html: src/browser/browser.html | build/browser
cp $< $@
# JSON of ancestor GECKO term (key) -> list of cohort terms (value)
# This is used to drive the filter functionality in the browser
build/browser/%-mapping.json: src/browser/generate_mapping_json.py build/%.tsv build/intermediate/gecko_index.tsv | build/browser
python3 $^ $@
# Top-level cohort data as HTML pages
COHORT_PAGES := $(foreach C,$(COHORTS),build/browser/cohorts/$(C).html)
$(COHORT_PAGES): src/browser/create_cohort_html.py data/cohort-data.json data/metadata.json src/browser/cohort.html.jinja2 build/browser/cohorts
python3 $^
BROWSER := build/browser/index.html \
build/browser/cineca.json \
build/browser/cohorts.json \
build/browser/metadata.json \
$(foreach C,$(COHORTS),build/browser/$(C)-mapping.json) \
$(foreach C,$(COHORTS),build/browser/$(C)-data.json) \
$(COHORT_PAGES)
.PHONY: browser
browser: $(BROWSER)
.PHONY: serve
serve: $(BROWSER)
cd build/browser && python3 -m http.server 8000
##################################################
####### IHCC Mapping validation pipeline #########
##################################################
build/gecko_labels.tsv: build/gecko.owl | build/robot.jar
$(ROBOT) export \
--input $< \
--header "LABEL" \
--export $@
# We always get the latest changes before running validation
build/cogs-problems.tsv: src/validate.py src/metadata-schema.json data/metadata.json build/terminology.tsv build/gecko_labels.tsv
python3 $^ $@
.PHONY: automated_validation
automated_validation:
make cogs_pull
make cogs-apply-problems
cogs push
###################################################
####### IHCC Mapping suggestions pipeline #########
###################################################
# Pipeline to generate mapping suggestions for a template. The template file is loaded,
# the suggestions generated and added as a colum "Suggested Categories" to the template.
MAP_SUGGEST := $(foreach C, $(COHORTS), build/suggestions_$(C).tsv)
GECKO_LEXICAL = build/intermediate/gecko-xrefs-sparql.csv
MAP_SCRIPT_DIR = src/mapping-suggest
MAP_SCRIPT_CONFIG = $(MAP_SCRIPT_DIR)/mapping-suggest-config.yml
.PHONY: all_mapping_suggest
all_mapping_suggest: src/mapping-suggest/mapping-suggest-qc.py $(MAP_SUGGEST)
python3 $< --templates $(MAP_SUGGEST) -v -o build/$@_report.tsv
.PHONY: id_generation_%
id_generation_%: $(MAP_SCRIPT_DIR)/id-generator-templates.py templates/%.tsv
python3 $< -t $(word 2,$^)
id_generation_cogs: $(MAP_SCRIPT_DIR)/id-generator-templates.py templates/cogs.tsv build/metadata.tsv
python3 $< -t $(word 2,$^) -m build/metadata.tsv
build/intermediate/%_mapping_suggestions_nlp.tsv: $(MAP_SCRIPT_DIR)/mapping-suggest-nlp.py \
templates/%.tsv $(GECKO_LEXICAL) \
id_generation_% | build/intermediate
python3 $< -z $(ZOOMA_DATASET) -c $(MAP_SCRIPT_CONFIG) -t templates/$*.tsv -g $(GECKO_LEXICAL) -o $@
build/intermediate/%_mapping_suggestions_zooma.tsv: $(MAP_SCRIPT_DIR)/mapping-suggest-zooma.py \
$(MAP_SCRIPT_CONFIG) templates/%.tsv \
id_generation_% | build/intermediate
python3 $< -c $(MAP_SCRIPT_CONFIG) -t templates/$*.tsv -o $@
# All of the mapping suggestion tables should have the following columns: ["confidence", "match", "match_label"]
build/suggestions_%.tsv: templates/%.tsv \
build/intermediate/%_mapping_suggestions_zooma.tsv \
build/intermediate/%_mapping_suggestions_nlp.tsv
python3 $(MAP_SCRIPT_DIR)/merge-mapping-suggestions.py -t $< $(patsubst %, -s %, $(filter-out $<,$^)) -o $@
build/cogs-data-validation.tsv build/cogs-info-table.tsv: $(MAP_SCRIPT_DIR)/create-data-validation.py build/terminology.tsv build/gecko_labels.tsv
python3 $^ build/cogs-data-validation.tsv build/cogs-info-table.tsv
# Pipeline to build a the zooma dataset that stores the existing mappings
MAP_DATA := $(foreach C, $(COHORTS), build/intermediate/$(C)-xrefs-sparql.csv)
# TODO: Should this depend on data_dictionaries/%.owl or better build/%.owl?
build/intermediate/%-xrefs-sparql.csv: build/%.owl src/queries/ihcc-mapping.sparql | build/intermediate build/robot.jar
$(ROBOT) query --input $< --query $(word 2,$^) $@
$(GECKO_LEXICAL): build/gecko.owl src/queries/ihcc-mapping-gecko.sparql | build/intermediate build/robot.jar
$(ROBOT) query --input $< --query $(word 2,$^) $@
$(ZOOMA_DATASET): $(MAP_SCRIPT_DIR)/ihcc-zooma-dataset.py $(GECKO_LEXICAL) $(MAP_DATA)
python3 $< $(patsubst %, -l %, $(filter-out $<,$^)) -w $(shell pwd) -o $@
.PHONY: update_ols
update_ols: $(OLS_CONFIG)
$(OLS_CONFIG): src/prepare_ols_config.py $(METADATA)
python3 $< $(patsubst %, -m %, $(METADATA)) -o $(OLS_CONFIG)
.PHONY: cogs_pull
cogs_pull:
cogs fetch
cogs pull
.PHONY: cogs_delete
cogs_delete:
cogs delete -f
.PHONY: automated_mapping
automated_mapping:
make cogs_pull
cp build/terminology.tsv templates/cogs.tsv
make build/suggestions_cogs.tsv
cp build/suggestions_cogs.tsv build/terminology.tsv
rm -f templates/cogs.tsv
make cogs-apply-data-validation
make cogs-apply-info-table
cogs push