From cfffd95e7fb6b4f8e250062c9675d17a0de4717f Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Thu, 15 Feb 2024 17:33:54 -0500 Subject: [PATCH 1/3] ICD11 config & docs - Rename: ICD11 -> ICD11Foundation - Rename prefix: icd11 -> icd11.foundation - Add prefixes: icd11.schema, icd11.z - Add: intensional exclusions TSV (currently empty) - Add: SPARQL query for selecting all diseases --- .../icd11foundation-relevant-diseases.sparql | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 src/sparql/icd11foundation-relevant-diseases.sparql diff --git a/src/sparql/icd11foundation-relevant-diseases.sparql b/src/sparql/icd11foundation-relevant-diseases.sparql new file mode 100644 index 00000000..e767f914 --- /dev/null +++ b/src/sparql/icd11foundation-relevant-diseases.sparql @@ -0,0 +1,32 @@ +PREFIX rdfs: +PREFIX owl: + + +### All diseases +SELECT DISTINCT ?term ?label ?deprecated +WHERE { + { + { + ?s1 ?p1 ?term . + ?term rdfs:subClassOf* . + OPTIONAL { + ?term rdfs:label ?label + } + OPTIONAL { + ?term owl:deprecated ?deprecated + } + } + UNION + { + ?term ?p2 ?o2 . + ?term rdfs:subClassOf* . 
+ OPTIONAL { + ?term rdfs:label ?label + } + OPTIONAL { + ?term owl:deprecated ?deprecated + } + } + } + FILTER(isIRI(?term)) +} From 81a2d47da010c018d0a6cba6daff501961907bc3 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Tue, 30 Jan 2024 19:51:32 -0500 Subject: [PATCH 2/3] ICD11 config & docs - Update: mondo-ingest-odk.yaml: New entry for ICD11 in 'components' - Updates from running 'make update_repo': - modified: docs/odk-workflows/ManageDocumentation.md - modified: docs/odk-workflows/RepositoryFileStructure.md - modified: src/ontology/Makefile - modified: src/ontology/run.sh - new file: src/scripts/run-command.sh - modified: src/scripts/update_repo.sh - Add: metadata/icd11.yml - Update: prefixes.csv --- src/ontology/Makefile | 2 +- src/ontology/metadata/icd11.yml | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 src/ontology/metadata/icd11.yml diff --git a/src/ontology/Makefile b/src/ontology/Makefile index baa9ada9..ed84b4ff 100644 --- a/src/ontology/Makefile +++ b/src/ontology/Makefile @@ -54,7 +54,7 @@ OBODATE ?= $(shell date +'%d:%m:%Y %H:%M') VERSION= $(TODAY) ANNOTATE_ONTOLOGY_VERSION = annotate -V $(ONTBASE)/releases/$(VERSION)/$@ --annotation owl:versionInfo $(VERSION) ANNOTATE_CONVERT_FILE = annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) convert -f ofn --output $@.tmp.owl && mv $@.tmp.owl $@ -OTHER_SRC = $(COMPONENTSDIR)/doid.owl $(COMPONENTSDIR)/gard.owl $(COMPONENTSDIR)/icd10cm.owl $(COMPONENTSDIR)/icd10who.owl $(COMPONENTSDIR)/icd11foundation.owl $(COMPONENTSDIR)/ncit.owl $(COMPONENTSDIR)/omim.owl $(COMPONENTSDIR)/ordo.owl +OTHER_SRC = $(COMPONENTSDIR)/doid.owl $(COMPONENTSDIR)/gard.owl $(COMPONENTSDIR)/icd10cm.owl $(COMPONENTSDIR)/icd10who.owl $(COMPONENTSDIR)/icd11foundation.owl $(COMPONENTSDIR)/ncit.owl $(COMPONENTSDIR)/omim.owl $(COMPONENTSDIR)/ordo.owl ONTOLOGYTERMS = $(TMPDIR)/ontologyterms.txt EDIT_PREPROCESSED = $(TMPDIR)/$(ONT)-preprocess.owl diff --git 
a/src/ontology/metadata/icd11.yml b/src/ontology/metadata/icd11.yml new file mode 100644 index 00000000..2d2143c7 --- /dev/null +++ b/src/ontology/metadata/icd11.yml @@ -0,0 +1,13 @@ +id: ICD11 +label: International Classification of Diseases 11th Revision +prefix_map: + ICD11: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD11/ +description: > + The International Classification of Diseases (ICD) provides a common language that allows health professionals to + share standardized information across the world. The eleventh revision contains around 17 000 unique codes, more than + 120 000 codable terms and is now entirely digital.Feb 11, 2022 + + This data source in particular is the ICD11 foundation, not one of its linearizations. +homepage: https://icd.who.int/ +base_prefix_map: + ICD11: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD11/ From 738ab75d009b10d937ed2853e1de64b5863f210d Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Thu, 8 Feb 2024 18:24:53 -0500 Subject: [PATCH 3/3] ICD11 Ingest - Update: mondo-ingest.Makefile - Add: $(COMPONENTSDIR)/icd11.owl - Add: config/icd11foundation-property-map.sssom.tsv - Update: ICD11 docs - Update: config/properties.txt - Update: config/context.json - Update: metadata/mondo.sssom.config.yml: added icd11.foundation to subject_prefixes - Update: lexmatch-sssom-compare.py: entry for icd11 - Update: add-new-source.md: Instructions for additional configuration necessities. General - Bugfix: Slurp files were sometimes getting removed because they were considered intermediates and not .PRECIOUS. - Bugfix: reports/*_exclusion_reasons.robot.template.tsv files were getting removed by the build for same reason as above. 
--- docs/developer/add-new-source.md | 6 ++++ docs/sources/icd11foundation.md | 23 +++++++++---- src/ontology/Makefile | 2 +- src/ontology/config/context.json | 2 ++ .../icd11foundation-property-map.sssom.tsv | 5 +++ src/ontology/config/properties.txt | 2 ++ src/ontology/metadata/icd11.yml | 13 -------- src/ontology/metadata/icd11foundation.yml | 14 ++++++++ src/ontology/metadata/mondo.sssom.config.yml | 1 + src/ontology/mondo-ingest.Makefile | 13 ++++++++ src/scripts/lexmatch-sssom-compare.py | 5 ++- .../icd11foundation-relevant-diseases.sparql | 32 ------------------- 12 files changed, 64 insertions(+), 54 deletions(-) create mode 100644 src/ontology/config/icd11foundation-property-map.sssom.tsv delete mode 100644 src/ontology/metadata/icd11.yml delete mode 100644 src/sparql/icd11foundation-relevant-diseases.sparql diff --git a/docs/developer/add-new-source.md b/docs/developer/add-new-source.md index b00c6faa..1b663ad2 100644 --- a/docs/developer/add-new-source.md +++ b/docs/developer/add-new-source.md @@ -21,6 +21,7 @@ Add a new metadata file to [src/ontology/metadata](https://github.com/monarch-in Prefixes need to be entered in the following places in the yml: - `curie_map` - `extended_prefix_map` +- `subject_prefixes` ### 2.3. `config/prefixes.csv` Add prefixes. @@ -28,6 +29,11 @@ Add prefixes. ### 2.4. `config/context.json` Add prefixes. +### 2.5. `lexmatch-sssom-compare.py` +There is a section of branching logic with a comment "Map ontology filenames to prefixes". Add an entry there if either +(a) there is 1 prefix you care about, and it is spelled differently than the component filename (e.g. the prefix is +`myontology`, but the filename is `components/my-ontology.owl`), or (b) there is more than 1 prefix. + ## 3. Docs ### 3.1. 
`mkdocs.yaml` Update the Website Table of Contents in [mkdocs.yaml](https://github.com/monarch-initiative/mondo-ingest/blob/main/mkdocs.yaml) diff --git a/docs/sources/icd11foundation.md b/docs/sources/icd11foundation.md index 8ab32c53..289b6f50 100644 --- a/docs/sources/icd11foundation.md +++ b/docs/sources/icd11foundation.md @@ -2,20 +2,29 @@ **Source name:** International Classification of Diseases 11th Revision -**Source description:** The International Classification of Diseases (ICD) provides a common language that allows health professionals to share standardized information across the world. The eleventh revision contains around 17 000 unique codes, more than 120 000 codable terms and is now entirely digital.Feb 11, 2022 +**Source description:** The International Classification of Diseases (ICD) provides a common language that allows health +professionals to share standardized information across the world. The eleventh revision contains around 17 000 unique +codes, more than 120 000 codable terms and is now entirely digital.Feb 11, 2022 This data source in particular is the ICD11 foundation, not one of its linearizations. - **Homepage:** https://icd.who.int/ -**Comments about this source:** -Because the existing logical equivalence class axioms led to equivalence cliques (groups of distinct disease identifiers -that inferred to he semantically identical) we decided to strip out all equivalence class axiom from the foundation -prior to processing it in the ingest. - +**Comments about this source:** +_Data source_ +_Original source URL_: https://icd11files.blob.core.windows.net/tmp/whofic-2023-04-08.owl.gz +_Preprocessing_ +In the [monarch-initiative/icd11](https://github.com/monarch-initiative/icd11) repo, we remove Unicode characters and +then remove equivalent class statements as discussed below. +_Equivalent classes_ +We remove all equivalent class statements as they are not unique and result in unintended node merges.
For example +`icd11.foundation:2000662282` (_Occupant of pick-up truck or van injured in collision with car, pick-up truck or van: +person on outside of vehicle injured in traffic accident_) has the same exact equivalent concept expression as +`icd11.foundation:1279712844` (_Occupant of pick-up truck or van injured in collision with two- or three- wheeled motor +vehicle: person on outside of vehicle injured in traffic accident_). +--- The data pipeline that generates the source is implemented in `make`, in this source file: [src/ontology/mondo-ingest.Makefile](https://github.com/monarch-initiative/mondo-ingest/blob/main/src/ontology/mondo-ingest.Makefile). diff --git a/src/ontology/Makefile b/src/ontology/Makefile index ed84b4ff..baa9ada9 100644 --- a/src/ontology/Makefile +++ b/src/ontology/Makefile @@ -54,7 +54,7 @@ OBODATE ?= $(shell date +'%d:%m:%Y %H:%M') VERSION= $(TODAY) ANNOTATE_ONTOLOGY_VERSION = annotate -V $(ONTBASE)/releases/$(VERSION)/$@ --annotation owl:versionInfo $(VERSION) ANNOTATE_CONVERT_FILE = annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) convert -f ofn --output $@.tmp.owl && mv $@.tmp.owl $@ -OTHER_SRC = $(COMPONENTSDIR)/doid.owl $(COMPONENTSDIR)/gard.owl $(COMPONENTSDIR)/icd10cm.owl $(COMPONENTSDIR)/icd10who.owl $(COMPONENTSDIR)/icd11foundation.owl $(COMPONENTSDIR)/ncit.owl $(COMPONENTSDIR)/omim.owl $(COMPONENTSDIR)/ordo.owl +OTHER_SRC = $(COMPONENTSDIR)/doid.owl $(COMPONENTSDIR)/gard.owl $(COMPONENTSDIR)/icd10cm.owl $(COMPONENTSDIR)/icd10who.owl $(COMPONENTSDIR)/icd11foundation.owl $(COMPONENTSDIR)/ncit.owl $(COMPONENTSDIR)/omim.owl $(COMPONENTSDIR)/ordo.owl ONTOLOGYTERMS = $(TMPDIR)/ontologyterms.txt EDIT_PREPROCESSED = $(TMPDIR)/$(ONT)-preprocess.owl diff --git a/src/ontology/config/context.json b/src/ontology/config/context.json index 89834463..793dff38 100644 --- a/src/ontology/config/context.json +++ b/src/ontology/config/context.json @@ -31,6 +31,8 @@ "NCBITaxon": "http://purl.obolibrary.org/obo/NCBITaxon_", 
"ICD10CM": "http://purl.bioontology.org/ontology/ICD10CM/", "ICD10WHO": "http://apps.who.int/classifications/icd10/browse/2010/en#/", + "icd11.foundation": "http://id.who.int/icd/entity/", + "icd11.z": "http://who.int/icd#Z_", "OMIMPS": "https://omim.org/phenotypicSeries/PS", "MONDOREL": "http://purl.obolibrary.org/obo/mondo#" } diff --git a/src/ontology/config/icd11foundation-property-map.sssom.tsv b/src/ontology/config/icd11foundation-property-map.sssom.tsv new file mode 100644 index 00000000..2cfd32be --- /dev/null +++ b/src/ontology/config/icd11foundation-property-map.sssom.tsv @@ -0,0 +1,5 @@ +subject_id object_id +http://id.who.int/icd/schema/isObsolote owl:deprecated +http://id.who.int/icd/schema/longDefinition http://purl.org/dc/terms/description +http://id.who.int/icd/schema/note rdfs:comment +skos:definition IAO:0000115 diff --git a/src/ontology/config/properties.txt b/src/ontology/config/properties.txt index 33de4e8c..79b56639 100644 --- a/src/ontology/config/properties.txt +++ b/src/ontology/config/properties.txt @@ -20,6 +20,8 @@ http://www.w3.org/2004/02/skos/core#narrowMatch http://www.w3.org/2004/02/skos/core#relatedMatch http://www.w3.org/2004/02/skos/core#exactMatch http://www.w3.org/2004/02/skos/core#closeMatch +rdfs:comment rdfs:label rdfs:seeAlso owl:deprecated +http://purl.org/dc/terms/description diff --git a/src/ontology/metadata/icd11.yml b/src/ontology/metadata/icd11.yml deleted file mode 100644 index 2d2143c7..00000000 --- a/src/ontology/metadata/icd11.yml +++ /dev/null @@ -1,13 +0,0 @@ -id: ICD11 -label: International Classification of Diseases 11th Revision -prefix_map: - ICD11: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD11/ -description: > - The International Classification of Diseases (ICD) provides a common language that allows health professionals to - share standardized information across the world. 
The eleventh revision contains around 17 000 unique codes, more than - 120 000 codable terms and is now entirely digital.Feb 11, 2022 - - This data source in particular is the ICD11 foundation, not one of its linearizations. -homepage: https://icd.who.int/ -base_prefix_map: - ICD11: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/ICD11/ diff --git a/src/ontology/metadata/icd11foundation.yml b/src/ontology/metadata/icd11foundation.yml index e3417a71..6e39f924 100644 --- a/src/ontology/metadata/icd11foundation.yml +++ b/src/ontology/metadata/icd11foundation.yml @@ -10,6 +10,20 @@ description: > 120 000 codable terms and is now entirely digital.Feb 11, 2022 This data source in particular is the ICD11 foundation, not one of its linearizations. +comments_about_this_source: > + _Data source_ + _Original source URL_: https://icd11files.blob.core.windows.net/tmp/whofic-2023-04-08.owl.gz + + _Preprocessing_ + In the [monarch-initiative/icd11](https://github.com/monarch-initiative/icd11) repo, we remove Unicode characters and + then remove equivalent class statements as discussed below. + + _Equivalent classes_ + We remove all equivalent class statements as they are not unique and result in unintended node merges. For example + `icd11.foundation:2000662282` (_Occupant of pick-up truck or van injured in collision with car, pick-up truck or van: + person on outside of vehicle injured in traffic accident_) has the same exact equivalent concept expression as + `icd11.foundation:1279712844` (_Occupant of pick-up truck or van injured in collision with two- or three- wheeled motor + vehicle: person on outside of vehicle injured in traffic accident_).
homepage: https://icd.who.int/ base_prefix_map: icd11.foundation: http://id.who.int/icd/entity/ diff --git a/src/ontology/metadata/mondo.sssom.config.yml b/src/ontology/metadata/mondo.sssom.config.yml index 2aefad03..0d95ed78 100644 --- a/src/ontology/metadata/mondo.sssom.config.yml +++ b/src/ontology/metadata/mondo.sssom.config.yml @@ -367,6 +367,7 @@ subject_prefixes: - EFO - ICD10CM - ICD10WHO + - icd11.foundation - OMIMPS - NCIT - DOID diff --git a/src/ontology/mondo-ingest.Makefile b/src/ontology/mondo-ingest.Makefile index 099dc9e0..0b84a20e 100644 --- a/src/ontology/mondo-ingest.Makefile +++ b/src/ontology/mondo-ingest.Makefile @@ -159,6 +159,17 @@ $(COMPONENTSDIR)/icd10who.owl: $(TMPDIR)/icd10who_relevant_signature.txt | compo remove -T config/properties.txt --select complement --select properties --trim true \ annotate --ontology-iri $(URIBASE)/mondo/sources/icd10who.owl --version-iri $(URIBASE)/mondo/sources/$(TODAY)/icd10who.owl -o $@; fi +$(COMPONENTSDIR)/icd11foundation.owl: $(TMPDIR)/icd11foundation_relevant_signature.txt | component-download-icd11foundation.owl + if [ $(COMP) = true ] ; then $(ROBOT) remove -i $(TMPDIR)/component-download-icd11foundation.owl.owl --select imports \ + rename --mappings config/property-map-1.sssom.tsv --allow-missing-entities true \ + rename --mappings config/icd11foundation-property-map.sssom.tsv \ + remove -T $(TMPDIR)/icd11foundation_relevant_signature.txt --select complement --select "classes individuals" --trim false \ + remove -T $(TMPDIR)/icd11foundation_relevant_signature.txt --select individuals \ + query \ + --update ../sparql/fix-labels-with-brackets.ru \ + remove -T config/properties.txt --select complement --select properties --trim true \ + annotate --ontology-iri $(URIBASE)/mondo/sources/icd11foundation.owl --version-iri $(URIBASE)/mondo/sources/$(TODAY)/icd11foundation.owl -o $@; fi + $(COMPONENTSDIR)/gard.owl: $(TMPDIR)/gard_relevant_signature.txt | component-download-gard.owl if [ $(COMP) = true ]; 
then $(ROBOT) remove -i $(TMPDIR)/component-download-gard.owl.owl --select imports \ remove -T $(TMPDIR)/gard_relevant_signature.txt --select complement --select "classes individuals" --trim false \ @@ -246,6 +257,7 @@ $(REPORTDIR)/%_term_exclusions.txt $(REPORTDIR)/%_exclusion_reasons.robot.templa --config-path metadata/$*.yml \ --outpath-txt $(REPORTDIR)/$*_term_exclusions.txt \ --outpath-robot-template-tsv $(REPORTDIR)/$*_exclusion_reasons.robot.template.tsv +.PRECIOUS: $(REPORTDIR)/%_exclusion_reasons.robot.template.tsv $(REPORTDIR)/%_exclusion_reasons.ttl: component-download-%.owl $(REPORTDIR)/%_exclusion_reasons.robot.template.tsv $(ROBOT) template --input $(TMPDIR)/component-download-$*.owl.owl --add-prefixes config/context.json --template $(REPORTDIR)/$*_exclusion_reasons.robot.template.tsv --output $(REPORTDIR)/$*_exclusion_reasons.ttl @@ -476,6 +488,7 @@ slurp/%.tsv: $(COMPONENTSDIR)/%.owl $(TMPDIR)/mondo.sssom.tsv $(REPORTDIR)/%_map --mondo-terms-path $(REPORTDIR)/mirror_signature-mondo.tsv \ --slurp-dir-path slurp/ \ --outpath $@ +.PRECIOUS: slurp/%.tsv .PHONY: slurp-% slurp-%: slurp/%.tsv diff --git a/src/scripts/lexmatch-sssom-compare.py b/src/scripts/lexmatch-sssom-compare.py index a5914b31..e6b0da03 100644 --- a/src/scripts/lexmatch-sssom-compare.py +++ b/src/scripts/lexmatch-sssom-compare.py @@ -173,11 +173,14 @@ def extract_unmapped_matches(input: str, matches: TextIO, output_dir: str, summa ont_df_list = [] for _, ont in enumerate(input): + # Map ontology filenames to prefixes ont2 = ont.upper() if ont == "omim": ont2 = "|".join((["OMIM", "OMIMPS"])) elif ont == "ordo": ont2 = "|".join((["ORDO", "Orphanet"])) + elif ont == "icd11foundation": + ont2 = 'icd11.foundation' mondo_ont_df = msdf_mondo.df[condition_mondo_sssom_subj & msdf_mondo.df['object_id'].str.contains(ont2)] mondo_ont_lex_df = lex_df[(condition_lex_df_mondo_subj & lex_df['object_id'].str.contains(ont2))] @@ -201,7 +204,7 @@ def extract_unmapped_matches(input: str, matches: TextIO, 
output_dir: str, summa ont_df_list.append(unmapped_ont_df) - combined_df = pd.concat(ont_df_list) + combined_df = pd.concat(ont_df_list) if ont_df_list else pd.DataFrame() combined_msdf = MappingSetDataFrame( df=combined_df, converter=msdf_lex.converter, metadata=msdf_lex.metadata diff --git a/src/sparql/icd11foundation-relevant-diseases.sparql b/src/sparql/icd11foundation-relevant-diseases.sparql deleted file mode 100644 index e767f914..00000000 --- a/src/sparql/icd11foundation-relevant-diseases.sparql +++ /dev/null @@ -1,32 +0,0 @@ -PREFIX rdfs: -PREFIX owl: - - -### All diseases -SELECT DISTINCT ?term ?label ?deprecated -WHERE { - { - { - ?s1 ?p1 ?term . - ?term rdfs:subClassOf* . - OPTIONAL { - ?term rdfs:label ?label - } - OPTIONAL { - ?term owl:deprecated ?deprecated - } - } - UNION - { - ?term ?p2 ?o2 . - ?term rdfs:subClassOf* . - OPTIONAL { - ?term rdfs:label ?label - } - OPTIONAL { - ?term owl:deprecated ?deprecated - } - } - } - FILTER(isIRI(?term)) -}