diff --git a/notebooks/Clinical/OMOP-Example.ipynb b/notebooks/Clinical/OMOP-Example.ipynb index 201fbfbe3..5cfe1a741 100644 --- a/notebooks/Clinical/OMOP-Example.ipynb +++ b/notebooks/Clinical/OMOP-Example.ipynb @@ -2,43 +2,66 @@ "cells": [ { "cell_type": "markdown", + "id": "1940578a", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "# OMOP Examples\n", "\n", "See https://github.com/jhu-bids/TermHub/issues/516\n" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "id": "3b8c6747", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Basic term lookup" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 1, - "outputs": [], - "source": [ - "from oaklib import get_adapter\n", - "adapter = get_adapter('input/n3c.db')" - ], + "id": "4de67698", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2023-08-19T02:03:00.757982Z", "start_time": "2023-08-19T02:02:58.831821Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } - } + }, + "outputs": [], + "source": [ + "from oaklib import get_adapter\n", + "adapter = get_adapter('input/n3c.db')" + ] }, { "cell_type": "code", "execution_count": 2, + "id": "c58acbfa", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:03:00.777882Z", + "start_time": "2023-08-19T02:03:00.760498Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -51,31 +74,41 @@ "source": [ "TERM_ID = \"omop:4195673\"\n", "print(adapter.label(TERM_ID))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:03:00.777882Z", - "start_time": "2023-08-19T02:03:00.760498Z" - } - } + ] }, { "cell_type": "markdown", + "id": "5752d298", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Basic Search" - ], 
- "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 3, + "id": "77c6eb24", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:03:05.066505Z", + "start_time": "2023-08-19T02:03:00.780740Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "['omop:1018433']" + "text/plain": [ + "['omop:1018433']" + ] }, "execution_count": 3, "metadata": {}, @@ -84,18 +117,22 @@ ], "source": [ "list(adapter.basic_search(\"Angioplasty\"))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:03:05.066505Z", - "start_time": "2023-08-19T02:03:00.780740Z" - } - } + ] }, { "cell_type": "code", "execution_count": 4, + "id": "0e1054d0", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:03:09.387547Z", + "start_time": "2023-08-19T02:03:05.069847Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -113,27 +150,35 @@ "from oaklib.datamodels.search import SearchConfiguration\n", "for t in list(adapter.basic_search(\"Angioplasty\", SearchConfiguration(is_partial=True)))[0:5]:\n", " print(t, adapter.label(t))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:03:09.387547Z", - "start_time": "2023-08-19T02:03:05.069847Z" - } - } + ] }, { "cell_type": "markdown", + "id": "76f14f50", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Graph Queries" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 5, + "id": "0de6da3d", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:03:09.411940Z", + "start_time": "2023-08-19T02:03:09.391590Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -173,18 +218,22 @@ "ancs = list(adapter.ancestors([TERM_ID], 
predicates=[IS_A]))\n", "for a in ancs:\n", " print(a, adapter.label(a))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:03:09.411940Z", - "start_time": "2023-08-19T02:03:09.391590Z" - } - } + ] }, { "cell_type": "code", "execution_count": 6, + "id": "23dceb5e", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:03:12.290810Z", + "start_time": "2023-08-19T02:03:09.411017Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -202,27 +251,35 @@ "descs = list(adapter.descendants([\"omop:4181322\"], predicates=[IS_A]))\n", "for d in descs[0:5]:\n", " print(d, adapter.label(d))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:03:12.290810Z", - "start_time": "2023-08-19T02:03:09.411017Z" - } - } + ] }, { "cell_type": "markdown", + "id": "1230ac38", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Semantic Similarity" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": 7, + "id": "cda772fc", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:03:17.051975Z", + "start_time": "2023-08-19T02:03:12.293483Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -241,40 +298,40 @@ "source": [ "from linkml_runtime.dumpers import yaml_dumper\n", "print(yaml_dumper.dumps(adapter.pairwise_similarity(TERM_ID, \"omop:2733612\")))" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:03:17.051975Z", - "start_time": "2023-08-19T02:03:12.293483Z" - } - } + ] }, { "cell_type": "markdown", + "id": "713ccfd2", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Paths" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 12, + 
"execution_count": 8, + "id": "a4922766", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:04:57.473444Z", + "start_time": "2023-08-19T02:04:57.379279Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4195673', 'Angioplasty of posterior tibial artery')]\n", - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4000756', 'Leg repair')]\n", - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4091623', 'Surgical repair of lower extremity')]\n", - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4181322', 'Surgical repair procedure by body site')]\n", - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4161058', 'Surgical repair of head and neck structure')]\n", - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4161828', 'Repair of eyelid')]\n", - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4330850', 'Reconstruction of eyelid')]\n", - "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness')]\n", "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4195673', 'Angioplasty of posterior tibial artery')]\n", "[('omop:4195673', 'Angioplasty of 
posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4000756', 'Leg repair')]\n", "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4091623', 'Surgical repair of lower extremity')]\n", @@ -290,6 +347,14 @@ "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4185115', 'Surgical repair')]\n", "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4045162', 'Reconstruction procedure')]\n", "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4330850', 'Reconstruction of eyelid')]\n", + "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness')]\n", + "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4195673', 'Angioplasty of posterior tibial artery')]\n", + "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4000756', 'Leg repair')]\n", + "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4091623', 'Surgical repair of lower extremity')]\n", + "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4181322', 'Surgical repair procedure by body site')]\n", + "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4161058', 'Surgical repair of head and neck structure')]\n", + "[('omop:4195673', 'Angioplasty of 
posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4161828', 'Repair of eyelid')]\n", + "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4330850', 'Reconstruction of eyelid')]\n", "[('omop:4195673', 'Angioplasty of posterior tibial artery'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness'), ('omop:4210771', 'Reconstruction of eyelid, full-thickness')]\n" ] } @@ -297,123 +362,198 @@ "source": [ "for path in adapter.paths([TERM_ID], [\"omop:4210771\"], predicates=[IS_A]):\n", " print([(elt, adapter.label(elt)) for elt in path])" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:04:57.473444Z", - "start_time": "2023-08-19T02:04:57.379279Z" - } - } + ] }, { "cell_type": "markdown", + "id": "7a43673f", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Subgraphs" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 15, - "outputs": [], - "source": [ - "seeds = [TERM_ID, \"omop:4210771\", \"omop:2733612\"]\n", - "g = adapter.ancestor_graph(seeds, predicates=[IS_A])" - ], + "execution_count": 9, + "id": "7c9d31c3", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2023-08-19T02:09:05.190787Z", "start_time": "2023-08-19T02:09:05.113987Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } - } + }, + "outputs": [], + "source": [ + "seeds = [TERM_ID, \"omop:4210771\", \"omop:2733612\"]\n", + "g = adapter.ancestor_graph(seeds, predicates=[IS_A])" + ] }, { "cell_type": "code", - "execution_count": 16, - "outputs": [], - "source": [ - "from oaklib.utilities.obograph_utils import graph_to_image\n", - "graph_to_image(g, seeds=seeds, imgfile=\"output/angioplasty.png\", format=\"png\")" - ], + "execution_count": 10, + "id": "c75e6964", "metadata": { - "collapsed": false, 
"ExecuteTime": { "end_time": "2023-08-19T02:09:08.917990Z", "start_time": "2023-08-19T02:09:07.817451Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } - } + }, + "outputs": [], + "source": [ + "from oaklib.utilities.obograph_utils import graph_to_image\n", + "graph_to_image(g, seeds=seeds, imgfile=\"output/angioplasty.png\", format=\"png\")" + ] }, { "cell_type": "markdown", + "id": "1d249bf2", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "![img](output/angioplasty.png)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "id": "1a899a25", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Export to networkx" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 17, - "outputs": [], - "source": [ - "from oaklib.utilities.obograph_utils import as_multi_digraph\n", - "nx_g = as_multi_digraph(g)" - ], + "execution_count": 11, + "id": "75adc8f8", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2023-08-19T02:10:44.733552Z", "start_time": "2023-08-19T02:10:44.729004Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } - } + }, + "outputs": [], + "source": [ + "from oaklib.utilities.obograph_utils import as_multi_digraph\n", + "nx_g = as_multi_digraph(g)" + ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, + "id": "bf22958c", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:10:55.153851Z", + "start_time": "2023-08-19T02:10:55.148269Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "NodeView(('omop:4027561', 'omop:2733612', 'omop:40489873', 'omop:4177089', 'omop:4302652', 'omop:4054559', 'omop:4324523', 'omop:4301351', 'omop:4002031', 'omop:4148948', 'omop:4012185', 'omop:4159949', 'omop:4181322', 'omop:4185115', 
'omop:4134598', 'omop:4330850', 'omop:4210771', 'omop:4045162', 'omop:4161828', 'omop:4161058', 'omop:4249123', 'omop:4139008', 'omop:4031321', 'omop:4154279', 'omop:4233946', 'omop:4040721', 'omop:4000756', 'omop:4195673', 'omop:4050134', 'omop:4190070', 'omop:4050128', 'omop:4062347', 'omop:4091623', 'omop:4160912', 'omop:46271049', 'omop:4030028', 'omop:4181193', 'omop:4311041', 'omop:4331725', 'omop:4184453'))" + "text/plain": [ + "NodeView(('omop:4027561', 'omop:2733612', 'omop:40489873', 'omop:4177089', 'omop:4302652', 'omop:4054559', 'omop:4324523', 'omop:4301351', 'omop:4002031', 'omop:4148948', 'omop:4012185', 'omop:4159949', 'omop:4181322', 'omop:4185115', 'omop:4134598', 'omop:4330850', 'omop:4210771', 'omop:4045162', 'omop:4161828', 'omop:4161058', 'omop:4249123', 'omop:4139008', 'omop:4031321', 'omop:4154279', 'omop:4233946', 'omop:4040721', 'omop:4000756', 'omop:4195673', 'omop:4050134', 'omop:4190070', 'omop:4050128', 'omop:4062347', 'omop:4091623', 'omop:4160912', 'omop:46271049', 'omop:4030028', 'omop:4181193', 'omop:4311041', 'omop:4331725', 'omop:4184453'))" + ] }, - "execution_count": 18, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nx_g.nodes" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:10:55.153851Z", - "start_time": "2023-08-19T02:10:55.148269Z" - } - } + ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 13, + "id": "60cba513", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:12:39.526490Z", + "start_time": "2023-08-19T02:12:39.518261Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'omop:4027561': 0.05128205128205128,\n 'omop:2733612': 0.05128205128205128,\n 'omop:40489873': 0.07692307692307693,\n 'omop:4177089': 0.07692307692307693,\n 'omop:4302652': 0.10256410256410256,\n 'omop:4054559': 0.10256410256410256,\n 'omop:4324523': 
0.07692307692307693,\n 'omop:4301351': 0.1794871794871795,\n 'omop:4002031': 0.10256410256410256,\n 'omop:4148948': 0.1282051282051282,\n 'omop:4012185': 0.07692307692307693,\n 'omop:4159949': 0.05128205128205128,\n 'omop:4181322': 0.10256410256410256,\n 'omop:4185115': 0.10256410256410256,\n 'omop:4134598': 0.05128205128205128,\n 'omop:4330850': 0.07692307692307693,\n 'omop:4210771': 0.02564102564102564,\n 'omop:4045162': 0.05128205128205128,\n 'omop:4161828': 0.07692307692307693,\n 'omop:4161058': 0.07692307692307693,\n 'omop:4249123': 0.05128205128205128,\n 'omop:4139008': 0.07692307692307693,\n 'omop:4031321': 0.05128205128205128,\n 'omop:4154279': 0.05128205128205128,\n 'omop:4233946': 0.07692307692307693,\n 'omop:4040721': 0.02564102564102564,\n 'omop:4000756': 0.07692307692307693,\n 'omop:4195673': 0.05128205128205128,\n 'omop:4050134': 0.05128205128205128,\n 'omop:4190070': 0.1794871794871795,\n 'omop:4050128': 0.07692307692307693,\n 'omop:4062347': 0.07692307692307693,\n 'omop:4091623': 0.10256410256410256,\n 'omop:4160912': 0.07692307692307693,\n 'omop:46271049': 0.07692307692307693,\n 'omop:4030028': 0.10256410256410256,\n 'omop:4181193': 0.07692307692307693,\n 'omop:4311041': 0.07692307692307693,\n 'omop:4331725': 0.07692307692307693,\n 'omop:4184453': 0.05128205128205128}" + "text/plain": [ + "{'omop:4027561': 0.05128205128205128,\n", + " 'omop:2733612': 0.05128205128205128,\n", + " 'omop:40489873': 0.07692307692307693,\n", + " 'omop:4177089': 0.07692307692307693,\n", + " 'omop:4302652': 0.10256410256410256,\n", + " 'omop:4054559': 0.10256410256410256,\n", + " 'omop:4324523': 0.07692307692307693,\n", + " 'omop:4301351': 0.1794871794871795,\n", + " 'omop:4002031': 0.10256410256410256,\n", + " 'omop:4148948': 0.1282051282051282,\n", + " 'omop:4012185': 0.07692307692307693,\n", + " 'omop:4159949': 0.05128205128205128,\n", + " 'omop:4181322': 0.10256410256410256,\n", + " 'omop:4185115': 0.10256410256410256,\n", + " 'omop:4134598': 0.05128205128205128,\n", 
+ " 'omop:4330850': 0.07692307692307693,\n", + " 'omop:4210771': 0.02564102564102564,\n", + " 'omop:4045162': 0.05128205128205128,\n", + " 'omop:4161828': 0.07692307692307693,\n", + " 'omop:4161058': 0.07692307692307693,\n", + " 'omop:4249123': 0.05128205128205128,\n", + " 'omop:4139008': 0.07692307692307693,\n", + " 'omop:4031321': 0.05128205128205128,\n", + " 'omop:4154279': 0.05128205128205128,\n", + " 'omop:4233946': 0.07692307692307693,\n", + " 'omop:4040721': 0.02564102564102564,\n", + " 'omop:4000756': 0.07692307692307693,\n", + " 'omop:4195673': 0.05128205128205128,\n", + " 'omop:4050134': 0.05128205128205128,\n", + " 'omop:4190070': 0.1794871794871795,\n", + " 'omop:4050128': 0.07692307692307693,\n", + " 'omop:4062347': 0.07692307692307693,\n", + " 'omop:4091623': 0.10256410256410256,\n", + " 'omop:4160912': 0.07692307692307693,\n", + " 'omop:46271049': 0.07692307692307693,\n", + " 'omop:4030028': 0.10256410256410256,\n", + " 'omop:4181193': 0.07692307692307693,\n", + " 'omop:4311041': 0.07692307692307693,\n", + " 'omop:4331725': 0.07692307692307693,\n", + " 'omop:4184453': 0.05128205128205128}" + ] }, - "execution_count": 21, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -422,78 +562,312 @@ "# find graph statistics using networkx\n", "import networkx as nx\n", "nx.degree_centrality(nx_g)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:12:39.526490Z", - "start_time": "2023-08-19T02:12:39.518261Z" - } - } + ] }, { "cell_type": "markdown", + "id": "101e0269", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## Term metadata" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, + "id": "50f31a17", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:14:39.686649Z", + "start_time": "2023-08-19T02:14:35.034364Z" + }, + "collapsed": false, + "jupyter": { 
+ "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': ['omop:4195673'],\n 'omop:concept_class_id': ['Procedure'],\n 'omop:concept_code': ['312644004'],\n 'omop:domain_id': ['Procedure'],\n 'omop:standard_concept': ['S'],\n 'omop:valid_end_date': ['2099-12-31'],\n 'omop:valid_start_date': ['2002-01-31'],\n 'omop:vocabulary_id': ['SNOMED'],\n 'rdfs:label': ['Angioplasty of posterior tibial artery'],\n 'sh:prefix': ['omop'],\n 'schema:url': ['https://athena.ohdsi.org/search-terms/terms/4195673'],\n 'rdfs:isDefinedBy': ['https://athena.ohdsi.org/search-terms/terms/']}" + "text/plain": [ + "{'id': ['omop:4195673'],\n", + " 'omop:concept_class_id': ['Procedure'],\n", + " 'omop:concept_code': ['312644004'],\n", + " 'omop:domain_id': ['Procedure'],\n", + " 'omop:standard_concept': ['S'],\n", + " 'omop:valid_end_date': ['2099-12-31'],\n", + " 'omop:valid_start_date': ['2002-01-31'],\n", + " 'omop:vocabulary_id': ['SNOMED'],\n", + " 'rdfs:label': ['Angioplasty of posterior tibial artery'],\n", + " 'sh:prefix': ['omop'],\n", + " 'schema:url': ['https://athena.ohdsi.org/search-terms/terms/4195673'],\n", + " 'rdfs:isDefinedBy': ['https://athena.ohdsi.org/search-terms/terms/']}" + ] }, - "execution_count": 24, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adapter.entity_metadata_map(TERM_ID)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:14:39.686649Z", - "start_time": "2023-08-19T02:14:35.034364Z" - } - } + ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 15, + "id": "fb74f7a7", + "metadata": { + "ExecuteTime": { + "end_time": "2023-08-19T02:15:26.000340Z", + "start_time": "2023-08-19T02:15:25.994821Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [], "source": [ "adapter.label(\"omop:312644004\")" - ], + ] + }, + { + "cell_type": "markdown", + "id": "34a4367d-9271-4a78-949d-9905d59e5a7c", 
"metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2023-08-19T02:15:26.000340Z", - "start_time": "2023-08-19T02:15:25.994821Z" + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## Semantic Similarity using Rust" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "62ccb499-dce3-4c4c-a5b0-836af733ea7f", + "metadata": {}, + "outputs": [], + "source": [ + "adapter = get_adapter('semsimian:input/n3c.db')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e0c59472-b6a3-47ec-abdc-93db1cbfde39", + "metadata": {}, + "outputs": [], + "source": [ + "terms1 = [\"omop:4195673\", \"omop:4000756\", \"omop:4002031\", \"omop:4012185\"]\n", + "terms2 = [\"omop:4030028\", \"omop:4050128\", \"omop:4050134\", \"omop:4054559\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "92807c91-8ca0-4a8a-bece-460c9799670e", + "metadata": {}, + "outputs": [], + "source": [ + "tsps = adapter.termset_pairwise_similarity(terms1, terms2, predicates=[IS_A])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6402925b-bd2a-4fe2-b17d-28607f3f972a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "subject_termset:\n", + " omop:4002031:\n", + " id: omop:4002031\n", + " label: Cardiovascular system repair\n", + " omop:4195673:\n", + " id: omop:4195673\n", + " label: Angioplasty of posterior tibial artery\n", + " omop:4000756:\n", + " id: omop:4000756\n", + " label: Leg repair\n", + " omop:4012185:\n", + " id: omop:4012185\n", + " label: Cardiovascular surgical procedure\n", + "object_termset:\n", + " omop:4050128:\n", + " id: omop:4050128\n", + " label: Angioplasty of artery\n", + " omop:4030028:\n", + " id: omop:4030028\n", + " label: Surgical procedure on lower extremity\n", + " omop:4050134:\n", + " id: omop:4050134\n", + " label: Angioplasty of crural artery\n", + " omop:4054559:\n", + " id: omop:4054559\n", + " label: Repair of blood 
vessel\n", + "subject_best_matches:\n", + " omop:4000756:\n", + " match_source: omop:4000756\n", + " score: 10.984781775651545\n", + " similarity:\n", + " subject_id: omop:4000756\n", + " object_id: omop:4050134\n", + " ancestor_id: omop:4091623\n", + " ancestor_label: ''\n", + " ancestor_information_content: 10.984781775651545\n", + " jaccard_similarity: 0.25\n", + " phenodigm_score: 1.657164881329823\n", + " match_source_label: Leg repair\n", + " match_target: omop:4050134\n", + " match_target_label: Angioplasty of crural artery\n", + " omop:4002031:\n", + " match_source: omop:4002031\n", + " score: 9.090426815106653\n", + " similarity:\n", + " subject_id: omop:4002031\n", + " object_id: omop:4050128\n", + " ancestor_id: omop:4002031\n", + " ancestor_label: Cardiovascular system repair\n", + " ancestor_information_content: 9.090426815106653\n", + " jaccard_similarity: 0.4166666666666667\n", + " phenodigm_score: 1.946195735178703\n", + " match_source_label: Cardiovascular system repair\n", + " match_target: omop:4050128\n", + " match_target_label: Angioplasty of artery\n", + " omop:4012185:\n", + " match_source: omop:4012185\n", + " score: 8.513920531310507\n", + " similarity:\n", + " subject_id: omop:4012185\n", + " object_id: omop:4054559\n", + " ancestor_id: omop:4012185\n", + " ancestor_label: Cardiovascular surgical procedure\n", + " ancestor_information_content: 8.513920531310507\n", + " jaccard_similarity: 0.25\n", + " phenodigm_score: 1.4589311610996685\n", + " match_source_label: Cardiovascular surgical procedure\n", + " match_target: omop:4054559\n", + " match_target_label: Repair of blood vessel\n", + " omop:4195673:\n", + " match_source: omop:4195673\n", + " score: 18.911289573272484\n", + " similarity:\n", + " subject_id: omop:4195673\n", + " object_id: omop:4050134\n", + " ancestor_id: omop:4050134\n", + " ancestor_label: Angioplasty of crural artery\n", + " ancestor_information_content: 18.911289573272484\n", + " jaccard_similarity: 0.88\n", + " 
phenodigm_score: 4.079452760417724\n", + " match_source_label: Angioplasty of posterior tibial artery\n", + " match_target: omop:4050134\n", + " match_target_label: Angioplasty of crural artery\n", + "object_best_matches:\n", + " omop:4030028:\n", + " match_source: omop:4030028\n", + " score: 9.734760203165433\n", + " similarity:\n", + " subject_id: omop:4030028\n", + " object_id: omop:4000756\n", + " ancestor_id: omop:4030028\n", + " ancestor_label: Surgical procedure on lower extremity\n", + " ancestor_information_content: 9.734760203165433\n", + " jaccard_similarity: 0.375\n", + " phenodigm_score: 1.910637348160827\n", + " match_source_label: Surgical procedure on lower extremity\n", + " match_target: omop:4000756\n", + " match_target_label: Leg repair\n", + " omop:4050128:\n", + " match_source: omop:4050128\n", + " score: 11.720054065584758\n", + " similarity:\n", + " subject_id: omop:4050128\n", + " object_id: omop:4195673\n", + " ancestor_id: omop:4050128\n", + " ancestor_label: Angioplasty of artery\n", + " ancestor_information_content: 11.720054065584758\n", + " jaccard_similarity: 0.48\n", + " phenodigm_score: 2.371840203614207\n", + " match_source_label: Angioplasty of artery\n", + " match_target: omop:4195673\n", + " match_target_label: Angioplasty of posterior tibial artery\n", + " omop:4050134:\n", + " match_source: omop:4050134\n", + " score: 18.911289573272484\n", + " similarity:\n", + " subject_id: omop:4050134\n", + " object_id: omop:4195673\n", + " ancestor_id: omop:4050134\n", + " ancestor_label: Angioplasty of crural artery\n", + " ancestor_information_content: 18.911289573272484\n", + " jaccard_similarity: 0.88\n", + " phenodigm_score: 4.079452760417724\n", + " match_source_label: Angioplasty of crural artery\n", + " match_target: omop:4195673\n", + " match_target_label: Angioplasty of posterior tibial artery\n", + " omop:4054559:\n", + " match_source: omop:4054559\n", + " score: 9.176700908405415\n", + " similarity:\n", + " subject_id: 
omop:4054559\n", + " object_id: omop:4195673\n", + " ancestor_id: omop:4054559\n", + " ancestor_label: Repair of blood vessel\n", + " ancestor_information_content: 9.176700908405415\n", + " jaccard_similarity: 0.32\n", + " phenodigm_score: 1.713634818358256\n", + " match_source_label: Repair of blood vessel\n", + " match_target: omop:4195673\n", + " match_target_label: Angioplasty of posterior tibial artery\n", + "average_score: 12.13040293072116\n", + "best_score: 18.911289573272484\n", + "metric: ancestor_information_content\n", + "\n" + ] } - } + ], + "source": [ + "print(yaml_dumper.dumps(tsps))" + ] }, { "cell_type": "code", "execution_count": null, + "id": "83dd744e-bad4-4a1d-80e7-46c77019bf6b", + "metadata": {}, "outputs": [], - "source": [], - "metadata": { - "collapsed": false - } + "source": [] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" } }, "nbformat": 4, diff --git a/notebooks/Commands/TermsetSimilarity.ipynb b/notebooks/Commands/TermsetSimilarity.ipynb index 5349d4771..b93d6335e 100644 --- a/notebooks/Commands/TermsetSimilarity.ipynb +++ b/notebooks/Commands/TermsetSimilarity.ipynb @@ -1,127 +1,115 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, - "id": "c87fdb82", + "cell_type": "markdown", + "id": "658e80e6-0ce8-4576-97ed-0d95f245748a", "metadata": {}, - "outputs": [], "source": [ - "alias hp runoak -i sqlite:obo:hp" + "# OAK termset-similarity command\n", + "\n", + "This notebook is intended as a supplement to the [main OAK CLI docs](https://incatools.github.io/ontology-access-kit/cli.html).\n", + "\n", + "This notebook provides examples for the 
`termset-similarity` command, which can be used to do an aggregate comparisons between\n", + "two sets of terms (term profiles).\n", + "\n", + "Use cases include:\n", + "\n", + "- comparing two genes based on their GO annotations, or their expression profiles (using Uberon)\n", + "- comparing two patients based on their HPO annotations\n", + "- compare a patient's HPO profile against a mouse allele using its MP profile, using PhenIO as a background\n", + "- comparing two people based on their favorite bands\n", + "\n", + "Note that this command isn't aware of the actual associations themselves - it relies on you to assemble the profile.\n", + "\n", + "The command is general and doesn't make any assumptions about ontology used. The user can control which predicates to use in traversal.\n", + "\n", + "## Help Option\n", + "\n", + "You can get help on any OAK command using `--help`" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "70661358", + "execution_count": 1, + "id": "a89d8429-5175-4eec-a15f-b9920e949831", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "subject_termset:\r\n", - " HP:0100752:\r\n", - " id: HP:0100752\r\n", - " label: Abnormal liver lobulation\r\n", - " HP:0007042:\r\n", - " id: HP:0007042\r\n", - " label: Focal white matter lesions\r\n", - "object_termset:\r\n", - " HP:0006555:\r\n", - " id: HP:0006555\r\n", - " label: Diffuse hepatic steatosis\r\n", - " HP:0025517:\r\n", - " id: HP:0025517\r\n", - " label: Hypoplastic hippocampus\r\n", - "subject_best_matches:\r\n", - " HP:0007042:\r\n", - " match_source: HP:0007042\r\n", - " score: 6.775984316965229\r\n", - " similarity:\r\n", - " subject_id: HP:0007042\r\n", - " object_id: HP:0025517\r\n", - " ancestor_id: HP:0100547\r\n", - " ancestor_label: Abnormal forebrain morphology\r\n", - " ancestor_information_content: 6.775984316965229\r\n", - " jaccard_similarity: 0.5\r\n", - " phenodigm_score: 1.8406499282814792\r\n", - " match_source_label: Focal 
white matter lesions\r\n", - " match_target: HP:0025517\r\n", - " match_target_label: Hypoplastic hippocampus\r\n", - " HP:0100752:\r\n", - " match_source: HP:0100752\r\n", - " score: 8.632074905566515\r\n", - " similarity:\r\n", - " subject_id: HP:0100752\r\n", - " object_id: HP:0006555\r\n", - " ancestor_id: HP:0410042\r\n", - " ancestor_label: Abnormal liver morphology\r\n", - " ancestor_information_content: 8.632074905566515\r\n", - " jaccard_similarity: 0.5\r\n", - " phenodigm_score: 2.0775075096815554\r\n", - " match_source_label: Abnormal liver lobulation\r\n", - " match_target: HP:0006555\r\n", - " match_target_label: Diffuse hepatic steatosis\r\n", - "object_best_matches:\r\n", - " HP:0006555:\r\n", - " match_source: HP:0006555\r\n", - " score: 8.632074905566515\r\n", - " similarity:\r\n", - " subject_id: HP:0100752\r\n", - " object_id: HP:0006555\r\n", - " ancestor_id: HP:0410042\r\n", - " ancestor_label: Abnormal liver morphology\r\n", - " ancestor_information_content: 8.632074905566515\r\n", - " jaccard_similarity: 0.5\r\n", - " phenodigm_score: 2.0775075096815554\r\n", - " match_source_label: Diffuse hepatic steatosis\r\n", - " match_target: HP:0100752\r\n", - " match_target_label: Abnormal liver lobulation\r\n", - " HP:0025517:\r\n", - " match_source: HP:0025517\r\n", - " score: 6.775984316965229\r\n", - " similarity:\r\n", - " subject_id: HP:0007042\r\n", - " object_id: HP:0025517\r\n", - " ancestor_id: HP:0100547\r\n", - " ancestor_label: Abnormal forebrain morphology\r\n", - " ancestor_information_content: 6.775984316965229\r\n", - " jaccard_similarity: 0.5\r\n", - " phenodigm_score: 1.8406499282814792\r\n", - " match_source_label: Hypoplastic hippocampus\r\n", - " match_target: HP:0007042\r\n", - " match_target_label: Focal white matter lesions\r\n", - "average_score: 7.704029611265872\r\n", - "best_score: 8.632074905566515\r\n" + "Usage: runoak termset-similarity [OPTIONS] [TERMS]...\n", + "\n", + " Termset similarity.\n", + "\n", + " This 
calculates a similarity matrix for two sets of terms.\n", + "\n", + " Example:\n", + "\n", + " runoak -i go.db termset-similarity -p i,p nucleus membrane @ \"nuclear\n", + " membrane\" vacuole -p i,p\n", + "\n", + " Python API:\n", + "\n", + " https://incatools.github.io/ontology-access-kit/interfaces/semantic-\n", + " similarity\n", + "\n", + " Data model:\n", + "\n", + " https://w3id.org/oak/similarity\n", + "\n", + "Options:\n", + " -p, --predicates TEXT A comma-separated list of predicates. This may\n", + " be a shorthand (i, p) or CURIE\n", + " -o, --output FILENAME Output file, e.g. obo file\n", + " -O, --output-type TEXT Desired output type\n", + " --autolabel / --no-autolabel If set, results will automatically have labels\n", + " assigned [default: autolabel]\n", + " --help Show this message and exit.\n" ] } ], "source": [ - "hp termset-similarity \"Abnormal liver lobulation\" \"Focal white matter lesions\" @ \"Diffuse hepatic steatosis\" \"Hypoplastic hippocampus\"" + "!runoak termset-similarity --help" + ] + }, + { + "cell_type": "markdown", + "id": "e2645351-06f6-4c89-9125-b38a682adb39", + "metadata": {}, + "source": [ + "## Set up an alias for HPO" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "e1138d8f", + "execution_count": 2, + "id": "c87fdb82", "metadata": {}, "outputs": [], "source": [ - "# alias hp2 runoak -i semsimian:sqlite:obo:hp" + "alias hp runoak -i sqlite:obo:hp" + ] + }, + { + "cell_type": "markdown", + "id": "ba203aeb-d935-49b6-b5bf-b27b55d40f2a", + "metadata": {}, + "source": [ + "## Compare two phenotype profiles" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "8838921c", + "execution_count": 3, + "id": "70661358", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2K[00:00:00] Building all X all pairwise similarity: \u001b[36m████████████████████████████████████████\u001b[34m\u001b[0m\u001b[0m 100%Building all X all pairwise similarity: 
\u001b[36m████████████████████░\u001b[34m░░░░░░░░░░░░░░░░░░░\u001b[0m\u001b[0m 50%subject_termset:\n", + "subject_termset:\n", " HP:0100752:\n", " id: HP:0100752\n", " label: Abnormal liver lobulation\n", @@ -136,70 +124,173 @@ " id: HP:0025517\n", " label: Hypoplastic hippocampus\n", "subject_best_matches:\n", + " HP:0007042:\n", + " match_source: HP:0007042\n", + " score: 6.775984316965229\n", + " similarity:\n", + " subject_id: HP:0007042\n", + " object_id: HP:0025517\n", + " ancestor_id: HP:0100547\n", + " ancestor_label: Abnormal forebrain morphology\n", + " ancestor_information_content: 6.775984316965229\n", + " jaccard_similarity: 0.5\n", + " phenodigm_score: 1.8406499282814792\n", + " match_source_label: Focal white matter lesions\n", + " match_target: HP:0025517\n", + " match_target_label: Hypoplastic hippocampus\n", " HP:0100752:\n", " match_source: HP:0100752\n", - " score: 8.634374534720992\n", + " score: 8.632074905566515\n", " similarity:\n", " subject_id: HP:0100752\n", " object_id: HP:0006555\n", " ancestor_id: HP:0410042\n", " ancestor_label: Abnormal liver morphology\n", - " ancestor_information_content: 8.634374534720992\n", + " ancestor_information_content: 8.632074905566515\n", " jaccard_similarity: 0.5\n", - " phenodigm_score: 2.077784220596666\n", + " phenodigm_score: 2.0775075096815554\n", " match_source_label: Abnormal liver lobulation\n", " match_target: HP:0006555\n", " match_target_label: Diffuse hepatic steatosis\n", + "object_best_matches:\n", + " HP:0006555:\n", + " match_source: HP:0006555\n", + " score: 8.632074905566515\n", + " similarity:\n", + " subject_id: HP:0100752\n", + " object_id: HP:0006555\n", + " ancestor_id: HP:0410042\n", + " ancestor_label: Abnormal liver morphology\n", + " ancestor_information_content: 8.632074905566515\n", + " jaccard_similarity: 0.5\n", + " phenodigm_score: 2.0775075096815554\n", + " match_source_label: Diffuse hepatic steatosis\n", + " match_target: HP:0100752\n", + " match_target_label: Abnormal 
liver lobulation\n", + " HP:0025517:\n", + " match_source: HP:0025517\n", + " score: 6.775984316965229\n", + " similarity:\n", + " subject_id: HP:0007042\n", + " object_id: HP:0025517\n", + " ancestor_id: HP:0100547\n", + " ancestor_label: Abnormal forebrain morphology\n", + " ancestor_information_content: 6.775984316965229\n", + " jaccard_similarity: 0.5\n", + " phenodigm_score: 1.8406499282814792\n", + " match_source_label: Hypoplastic hippocampus\n", + " match_target: HP:0007042\n", + " match_target_label: Focal white matter lesions\n", + "average_score: 7.704029611265872\n", + "best_score: 8.632074905566515\n" + ] + } + ], + "source": [ + "hp termset-similarity \"Abnormal liver lobulation\" \"Focal white matter lesions\" @ \"Diffuse hepatic steatosis\" \"Hypoplastic hippocampus\"" + ] + }, + { + "cell_type": "markdown", + "id": "e9839eb8-9dbb-445b-b0d5-168ad92b38b0", + "metadata": {}, + "source": [ + "## Faster comparisons using Rust\n", + "\n", + "OAK has the ability to use semsimian to use a more efficient semantic similarity implementation under the hood" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8838921c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[1A[00:00:00] Building (all subjects X all objects) pairwise similarity: \u001b[36m████████████████████████████████████████\u001b[34m\u001b[0m\u001b[0m 100%\u001b[1Aing (all subjects X all objects) pairwise similarity: \u001b[36m████████████████████░\u001b[34m░░░░░░░░░░░░░░░░░░░\u001b[0m\u001b[0m 50%\u001b[1AWARNING:root:Adding labels not yet implemented in SemsimianImplementation.\n", + "subject_termset:\n", + " HP:0007042:\n", + " id: HP:0007042\n", + " label: Focal white matter lesions\n", + " HP:0100752:\n", + " id: HP:0100752\n", + " label: Abnormal liver lobulation\n", + "object_termset:\n", + " HP:0025517:\n", + " id: HP:0025517\n", + " label: Hypoplastic hippocampus\n", + " HP:0006555:\n", + " id: 
HP:0006555\n", + " label: Diffuse hepatic steatosis\n", + "subject_best_matches:\n", " HP:0007042:\n", " match_source: HP:0007042\n", - " score: 6.778283946119706\n", + " score: 6.7759382869726945\n", " similarity:\n", " subject_id: HP:0007042\n", " object_id: HP:0025517\n", " ancestor_id: HP:0100547\n", - " ancestor_label: Abnormal forebrain morphology\n", - " ancestor_information_content: 6.778283946119706\n", + " ancestor_label: ''\n", + " ancestor_information_content: 6.7759382869726945\n", " jaccard_similarity: 0.5\n", - " phenodigm_score: 1.8409622410739046\n", + " phenodigm_score: 1.8406436764040854\n", " match_source_label: Focal white matter lesions\n", " match_target: HP:0025517\n", " match_target_label: Hypoplastic hippocampus\n", + " HP:0100752:\n", + " match_source: HP:0100752\n", + " score: 8.632028875573981\n", + " similarity:\n", + " subject_id: HP:0100752\n", + " object_id: HP:0006555\n", + " ancestor_id: HP:0410042\n", + " ancestor_label: ''\n", + " ancestor_information_content: 8.632028875573981\n", + " jaccard_similarity: 0.5\n", + " phenodigm_score: 2.0775019705855855\n", + " match_source_label: Abnormal liver lobulation\n", + " match_target: HP:0006555\n", + " match_target_label: Diffuse hepatic steatosis\n", "object_best_matches:\n", " HP:0006555:\n", " match_source: HP:0006555\n", - " score: 8.634374534720992\n", + " score: 8.632028875573981\n", " similarity:\n", - " subject_id: HP:0100752\n", - " object_id: HP:0006555\n", + " subject_id: HP:0006555\n", + " object_id: HP:0100752\n", " ancestor_id: HP:0410042\n", - " ancestor_label: Abnormal liver morphology\n", - " ancestor_information_content: 8.634374534720992\n", + " ancestor_label: ''\n", + " ancestor_information_content: 8.632028875573981\n", " jaccard_similarity: 0.5\n", - " phenodigm_score: 2.077784220596666\n", + " phenodigm_score: 2.0775019705855855\n", " match_source_label: Diffuse hepatic steatosis\n", " match_target: HP:0100752\n", " match_target_label: Abnormal liver 
lobulation\n", " HP:0025517:\n", " match_source: HP:0025517\n", - " score: 6.778283946119706\n", + " score: 6.7759382869726945\n", " similarity:\n", - " subject_id: HP:0007042\n", - " object_id: HP:0025517\n", + " subject_id: HP:0025517\n", + " object_id: HP:0007042\n", " ancestor_id: HP:0100547\n", - " ancestor_label: Abnormal forebrain morphology\n", - " ancestor_information_content: 6.778283946119706\n", + " ancestor_label: ''\n", + " ancestor_information_content: 6.7759382869726945\n", " jaccard_similarity: 0.5\n", - " phenodigm_score: 1.8409622410739046\n", + " phenodigm_score: 1.8406436764040854\n", " match_source_label: Hypoplastic hippocampus\n", " match_target: HP:0007042\n", " match_target_label: Focal white matter lesions\n", - "average_score: 7.706329240420349\n", - "best_score: 8.634374534720992\n" + "average_score: 7.703983581273338\n", + "best_score: 8.632028875573981\n", + "metric: ancestor_information_content\n" ] } ], "source": [ - "!runoak -i semsimian:sqlite:obo:hp termset-similarity \"Abnormal liver lobulation\" \"Focal white matter lesions\" @ \"Diffuse hepatic steatosis\" \"Hypoplastic hippocampus\"" + "!runoak -i semsimian:sqlite:obo:hp termset-similarity -p i \"Abnormal liver lobulation\" \"Focal white matter lesions\" @ \"Diffuse hepatic steatosis\" \"Hypoplastic hippocampus\"" ] }, { diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 13580e76e..001461ae7 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -147,6 +147,10 @@ logical_definition_analyzer, logical_definition_summarizer, ) +from oaklib.utilities.axioms.disjointness_axiom_analyzer import ( + DisjointnessInducerConfig, + generate_disjoint_class_expressions_axioms, +) from oaklib.utilities.iterator_utils import chunk from oaklib.utilities.kgcl_utilities import ( generate_change_id, @@ -3420,6 +3424,82 @@ def _exclude_ldef(ldef: LogicalDefinitionAxiom) -> bool: writer.file.close() +@main.command() +@click.argument("terms", nargs=-1) +@predicates_option 
+@autolabel_option +@output_type_option +@click.option( + "--named-classes-only/--no-named-classes-only", + default=False, + show_default=True, + help="Only show disjointness axioms between two named classes.", +) +@output_option +def disjoints( + terms, + predicates: str, + autolabel: bool, + output_type: str, + named_classes_only: bool, + output: str, +): + """ + Show all disjoints for a set of terms, or whole ontology. + + Leave off all arguments for defaults - all terms, YAML OboGraph model + serialization: + + Example: + + runoak -i sqlite:obo:uberon disjoints + + Note that this will include pairwise disjoints, setwise disjoints, + disjoint unions, and disjoints involving simple class expressions. + + A tabular format can be easier to browse, and includes labels by default: + + Example: + + runoak -i sqlite:obo:uberon disjoints --autolabel -O csv + + To perform this on a subset: + + Example: + + runoak -i sqlite:obo:cl disjoints --autolabel -O csv .desc//p=i "immune cell" + + Data model: + + https://w3id.org/oak/obograph + """ + impl = settings.impl + writer = _get_writer(output_type, impl, StreamingYamlWriter) + writer.output = output + writer.autolabel = autolabel + actual_predicates = _process_predicates_arg(predicates) + + label_fields = [ + "classIds", + "classExpressionPropertyIds", + "classExpressionFillerIds", + "unionEquivalentToFillerId", + "unionEquivalentToPropertyId", + ] + if not isinstance(impl, OboGraphInterface): + raise NotImplementedError(f"Cannot execute this using {type(impl)}") + if terms == ".all": + terms = None + term_it = query_terms_iterator(terms, impl) if terms else None + dxas = impl.disjoint_class_expressions_axioms(term_it, predicates=actual_predicates) + for dxa in dxas: + if named_classes_only and dxa.classExpressions: + continue + writer.emit(dxa, label_fields=label_fields) + writer.finish() + writer.file.close() + + @main.command() @output_option @output_type_option @@ -6038,5 +6118,73 @@ def generate_logical_definitions( 
writer.file.close() +@main.command() +@click.argument("terms", nargs=-1) +@autolabel_option +@output_option +@predicates_option +@output_type_option +@click.option( + "--min-descendants", + "-M", + default=3, + show_default=True, + help="Minimum number of descendants for a class to have to be considered a candidate.", +) +@click.option( + "--exclude-existing/--no-exclude-existing", + default=True, + show_default=True, + help="Do not report duplicates with existing disjointness axioms.", +) +def generate_disjoints( + terms, + predicates, + autolabel, + output, + output_type, + exclude_existing, + min_descendants, +): + """ + Generate candidate disjointness axioms. + + Example: + + runoak -i sqlite:obo:iao generate-disjoints -O obo + + To generate spatial disjointness axioms: + + runoak -i sqlite:obo:zfa generate-disjoints -O obo -p i,p + """ + impl = settings.impl + writer = _get_writer(output_type, impl, StreamingYamlWriter, kgcl) + writer.output = output + writer.autolabel = autolabel + if not isinstance(impl, OboGraphInterface): + raise NotImplementedError + curies = list(query_terms_iterator(terms, impl)) + actual_predicates = _process_predicates_arg(predicates) + if not actual_predicates: + actual_predicates = [IS_A] + config = DisjointnessInducerConfig( + min_descendants=min_descendants, exclude_existing=exclude_existing + ) + dxas = generate_disjoint_class_expressions_axioms( + impl, curies, [actual_predicates], config=config + ) + label_fields = [ + "classIds", + "classExpressionPropertyIds", + "classExpressionFillerIds", + "unionEquivalentToFillerId", + "unionEquivalentToPropertyId", + ] + for dxa in dxas: + writer.emit(dxa, label_fields=label_fields) + writer.finish() + writer.file.close() + + if __name__ == "__main__": main() diff --git a/src/oaklib/datamodels/obograph.py b/src/oaklib/datamodels/obograph.py index f9c4d2eea..026ea87be 100644 --- a/src/oaklib/datamodels/obograph.py +++ b/src/oaklib/datamodels/obograph.py @@ -1,20 +1,15 @@ -# Auto generated
from obograph.yaml by pythongen.py version: 0.9.0 -# Generation date: 2023-05-13T10:22:20 +# Auto generated from obograph.yaml by pythongen.py version: 0.0.1 +# Generation date: 2023-08-29T10:52:15 # Schema: obographs_datamodel # # id: https://github.com/geneontology/obographs -# description: A data model for graph-oriented representations of ontologies. Each ontology is represented as a -# Graph, and multiple ontologies can be connected together in a GraphDocument. The principle elements -# of a Graph are Node objects and Edge objects. A Node represents an arbitrary ontology element, -# including but not limited to the core terms in the ontology. Edges represent simple relationships -# between Nodes. Nodes and Edges can both have Meta objects attached, providing additional metedata. -# Not everything in an ontology can be represented as nodes and edges. More complex axioms have -# specialized structures such as DomainRangeAxiom objects and LogicalDefinitionAxiom. +# description: A data model for graph-oriented representations of ontologies. Each ontology is represented as a Graph, and multiple ontologies can be connected together in a GraphDocument. +# The principle elements of a Graph are Node objects and Edge objects. A Node represents an arbitrary ontology element, including but not limited to the core terms in the ontology. Edges represent simple relationships between Nodes. Nodes and Edges can both have Meta objects attached, providing additional metedata. +# Not everything in an ontology can be represented as nodes and edges. More complex axioms have specialized structures such as DomainRangeAxiom objects and LogicalDefinitionAxiom. 
# license: https://creativecommons.org/publicdomain/zero/1.0/ import dataclasses import re -import sys from dataclasses import dataclass from typing import Any, ClassVar, Dict, List, Optional, Union @@ -819,6 +814,59 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): super().__post_init__(**kwargs) +@dataclass +class DisjointClassExpressionsAxiom(Axiom): + """ + An axiom that defines a set of classes or class expressions as being mutually disjoint. Formally, there exists no + instance that instantiates more that one of the union of classIds and classExpressions. + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = OBOGRAPHS.DisjointClassExpressionsAxiom + class_class_curie: ClassVar[str] = "obographs:DisjointClassExpressionsAxiom" + class_name: ClassVar[str] = "DisjointClassExpressionsAxiom" + class_model_uri: ClassVar[URIRef] = OBOGRAPHS.DisjointClassExpressionsAxiom + + classIds: Optional[Union[str, List[str]]] = empty_list() + classExpressions: Optional[ + Union[ + Union[dict, ExistentialRestrictionExpression], + List[Union[dict, ExistentialRestrictionExpression]], + ] + ] = empty_list() + unionEquivalentTo: Optional[str] = None + unionEquivalentToExpression: Optional[Union[dict, ExistentialRestrictionExpression]] = None + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if not isinstance(self.classIds, list): + self.classIds = [self.classIds] if self.classIds is not None else [] + self.classIds = [v if isinstance(v, str) else str(v) for v in self.classIds] + + if not isinstance(self.classExpressions, list): + self.classExpressions = ( + [self.classExpressions] if self.classExpressions is not None else [] + ) + self.classExpressions = [ + v + if isinstance(v, ExistentialRestrictionExpression) + else ExistentialRestrictionExpression(**as_dict(v)) + for v in self.classExpressions + ] + + if self.unionEquivalentTo is not None and not isinstance(self.unionEquivalentTo, str): + 
self.unionEquivalentTo = str(self.unionEquivalentTo) + + if self.unionEquivalentToExpression is not None and not isinstance( + self.unionEquivalentToExpression, ExistentialRestrictionExpression + ): + self.unionEquivalentToExpression = ExistentialRestrictionExpression( + **as_dict(self.unionEquivalentToExpression) + ) + + super().__post_init__(**kwargs) + + @dataclass class PropertyChainAxiom(Axiom): """ @@ -873,7 +921,7 @@ class ScopeEnum(EnumDefinitionImpl): ) hasRelatedSynonym = PermissibleValue( text="hasRelatedSynonym", - description="The synonym represents something closely related in meaning than the node, but in not exact, broad, or narrow.", + description="""The synonym represents something closely related in meaning than the node, but in not exact, broad, or narrow.""", meaning=OIO.hasRelatedSynonym, ) @@ -1118,6 +1166,20 @@ class slots: ], ) +slots.disjointClassExpressionsAxioms = Slot( + uri=OBOGRAPHS.disjointClassExpressionsAxioms, + name="disjointClassExpressionsAxioms", + curie=OBOGRAPHS.curie("disjointClassExpressionsAxioms"), + model_uri=OBOGRAPHS.disjointClassExpressionsAxioms, + domain=None, + range=Optional[ + Union[ + Union[dict, DisjointClassExpressionsAxiom], + List[Union[dict, DisjointClassExpressionsAxiom]], + ] + ], +) + slots.domainRangeAxioms = Slot( uri=OBOGRAPHS.domainRangeAxioms, name="domainRangeAxioms", @@ -1347,6 +1409,47 @@ class slots: ], ) +slots.disjointClassExpressionsAxiom__classIds = Slot( + uri=OBOGRAPHS.classIds, + name="disjointClassExpressionsAxiom__classIds", + curie=OBOGRAPHS.curie("classIds"), + model_uri=OBOGRAPHS.disjointClassExpressionsAxiom__classIds, + domain=None, + range=Optional[Union[str, List[str]]], +) + +slots.disjointClassExpressionsAxiom__classExpressions = Slot( + uri=OBOGRAPHS.classExpressions, + name="disjointClassExpressionsAxiom__classExpressions", + curie=OBOGRAPHS.curie("classExpressions"), + model_uri=OBOGRAPHS.disjointClassExpressionsAxiom__classExpressions, + domain=None, + range=Optional[ + 
Union[ + Union[dict, ExistentialRestrictionExpression], + List[Union[dict, ExistentialRestrictionExpression]], + ] + ], +) + +slots.disjointClassExpressionsAxiom__unionEquivalentTo = Slot( + uri=OBOGRAPHS.unionEquivalentTo, + name="disjointClassExpressionsAxiom__unionEquivalentTo", + curie=OBOGRAPHS.curie("unionEquivalentTo"), + model_uri=OBOGRAPHS.disjointClassExpressionsAxiom__unionEquivalentTo, + domain=None, + range=Optional[str], +) + +slots.disjointClassExpressionsAxiom__unionEquivalentToExpression = Slot( + uri=OBOGRAPHS.unionEquivalentToExpression, + name="disjointClassExpressionsAxiom__unionEquivalentToExpression", + curie=OBOGRAPHS.curie("unionEquivalentToExpression"), + model_uri=OBOGRAPHS.disjointClassExpressionsAxiom__unionEquivalentToExpression, + domain=None, + range=Optional[Union[dict, ExistentialRestrictionExpression]], +) + slots.Meta_xrefs = Slot( uri=OBOGRAPHS.xrefs, name="Meta_xrefs", diff --git a/src/oaklib/datamodels/obograph.yaml b/src/oaklib/datamodels/obograph.yaml index f7fe6500f..cd56fd3e1 100644 --- a/src/oaklib/datamodels/obograph.yaml +++ b/src/oaklib/datamodels/obograph.yaml @@ -251,6 +251,13 @@ slots: range: LogicalDefinitionAxiom description: >- A list of logical definition axioms that define the meaning of a class in terms of other classes. + disjointClassExpressionsAxioms: + multivalued: true + inlined_as_list: true + range: DisjointClassExpressionsAxiom + description: >- + A list of logical disjointness axioms that specify that a class or class expression + is disjoint from other classes or class expressions. domainRangeAxioms: multivalued: true range: DomainRangeAxiom @@ -583,6 +590,35 @@ classes: see_also: - https://github.com/geneontology/obographs/issues/89 + DisjointClassExpressionsAxiom: + aliases: + - disjoint classes + description: >- + An axiom that defines a set of classes or class expressions as being mutually disjoint. 
+ Formally, there exists no instance that instantiates more than one of the union of + classIds and classExpressions. + is_a: Axiom + attributes: + classIds: + description: >- + The set of named classes that are mutually disjoint. + multivalued: true + classExpressions: + description: >- + The set of class expressions that are mutually disjoint. + comments: + - currently restricted to existential restrictions (some values from) + range: ExistentialRestrictionExpression + multivalued: true + unionEquivalentTo: + description: >- + If present, this equates to an OWL DisjointUnion expression. + unionEquivalentToExpression: + range: ExistentialRestrictionExpression + description: >- + If present, this class expression is equivalent to the (disjoint) union of + the classIds and classExpressions. + PropertyChainAxiom: is_a: Axiom description: >- diff --git a/src/oaklib/implementations/sqldb/sql_implementation.py b/src/oaklib/implementations/sqldb/sql_implementation.py index 73aba0bef..d6e10ae36 100644 --- a/src/oaklib/implementations/sqldb/sql_implementation.py +++ b/src/oaklib/implementations/sqldb/sql_implementation.py @@ -44,6 +44,7 @@ ObjectPropertyNode, OntologyNode, OwlAxiomAnnotation, + OwlDisjointClassStatement, OwlEquivalentClassStatement, OwlSomeValuesFrom, Prefix, @@ -66,6 +67,7 @@ from oaklib.datamodels import obograph, ontology_metadata from oaklib.datamodels.association import Association from oaklib.datamodels.obograph import ( + DisjointClassExpressionsAxiom, ExistentialRestrictionExpression, LogicalDefinitionAxiom, ) @@ -964,7 +966,7 @@ def _tbox_relationships( if not include_dangling: subq = self.session.query(Statements.subject) q = q.filter(tbl.object.in_(subq)) - logging.info(f"Tbox query: {q}") + logging.debug(f"Tbox query: {q}") for row in q: yield row.subject, row.predicate, row.object @@ -989,7 +991,7 @@ def _object_property_assertion_relationships( q = q.filter(Statements.predicate.in_(op_subq)) if objects: q =
q.filter(Statements.object.in_(tuple(objects))) - logging.info(f"Abox query: {q}") + logging.debug(f"Abox query: {q}") for row in q: if not row.object: # edge case: see https://github.com/monarch-initiative/phenio/issues/36 @@ -1056,7 +1058,7 @@ def _rbox_relationships( q = q.filter(Statements.subject.in_(tuple(subjects))) if objects: q = q.filter(Statements.object.in_(tuple(objects))) - logging.info(f"RBOX query: {q}") + logging.debug(f"RBOX query: {q}") for row in q: yield row.subject, row.predicate, row.object @@ -1567,6 +1569,81 @@ def _logical_definitions_from_eq_query( continue yield ldef + def _node_to_class_expression( + self, node: str + ) -> Optional[Union[CURIE, ExistentialRestrictionExpression]]: + if not _is_blank(node): + return node + + svfq = self.session.query(OwlSomeValuesFrom).filter(OwlSomeValuesFrom.id == node) + svfq = list(svfq) + if svfq: + if len(svfq) > 1: + raise ValueError(f"Incorrect rdf structure for equiv axioms for {node}") + svf = svfq[0] + return ExistentialRestrictionExpression(propertyId=svf.on_property, fillerId=svf.filler) + else: + return None + + def disjoint_class_expressions_axioms( + self, + subjects: Optional[Iterable[CURIE]] = None, + predicates: Iterable[PRED_CURIE] = None, + group=False, + **kwargs, + ) -> Iterable[DisjointClassExpressionsAxiom]: + logging.info("Getting disjoint class expression axioms") + q = self.session.query(OwlDisjointClassStatement) + if predicates is not None: + predicates = list(predicates) + if subjects: + subjects = list(subjects) + axs = [] + for da in q: + # blank + sx = self._node_to_class_expression(da.subject) + ox = self._node_to_class_expression(da.object) + allx = [sx, ox] + allx_named = [x for x in allx if isinstance(x, str)] + allx_exprs = [x for x in allx if isinstance(x, ExistentialRestrictionExpression)] + allx_fillers = [x.fillerId for x in allx_exprs] + if subjects: + if not any(x for x in allx_named if x in subjects) and not any( + x for x in allx_fillers if x in subjects + ): 
+ continue + if predicates: + if not any(x for x in allx_exprs if x.propertyId in predicates): + continue + ax = DisjointClassExpressionsAxiom( + classIds=allx_named, + classExpressions=allx_exprs, + ) + axs.append(ax) + q = self.session.query(RdfTypeStatement.subject).filter( + RdfTypeStatement.object == "owl:AllDisjointClasses" + ) + for adc in q: + class_ids = [] + class_exprs = [] + for (m,) in self.session.query(Statements.object).filter( + and_( + Statements.subject == adc.subject, + Statements.predicate == "owl:members", + ) + ): + if isinstance(m, str): + class_ids.append(m) + else: + class_exprs.append(self._node_to_class_expression(m)) + axs.append( + DisjointClassExpressionsAxiom( + classIds=class_ids, + classExpressions=class_exprs, + ) + ) + yield from axs + # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # Implements: RelationGraphInterface # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/oaklib/interfaces/obograph_interface.py b/src/oaklib/interfaces/obograph_interface.py index 6fefea325..1fbef629f 100644 --- a/src/oaklib/interfaces/obograph_interface.py +++ b/src/oaklib/interfaces/obograph_interface.py @@ -8,6 +8,7 @@ from sssom.constants import RDFS_SUBCLASS_OF, RDFS_SUBPROPERTY_OF from oaklib.datamodels.obograph import ( + DisjointClassExpressionsAxiom, Edge, Graph, LogicalDefinitionAxiom, @@ -530,6 +531,24 @@ def logical_definitions( """ return iter(()) + def disjoint_class_expressions_axioms( + self, + subjects: Optional[Iterable[CURIE]] = None, + predicates: Iterable[PRED_CURIE] = None, + group=False, + **kwargs, + ) -> Iterable[DisjointClassExpressionsAxiom]: + """ + Yields all disjoint class expressions. 
+ + :param subjects: if present, filter to only those that reference these subjects + :param predicates: if present, filter to only those that reference these predicates + :param group: if True, group into cliques + :param kwargs: + :return: + """ + return iter(()) + def add_metadata(self, graph: Graph) -> None: """ Decorates the graph with meta objects on all nodes diff --git a/src/oaklib/io/streaming_csv_writer.py b/src/oaklib/io/streaming_csv_writer.py index c9a9a4de6..536eef4f1 100644 --- a/src/oaklib/io/streaming_csv_writer.py +++ b/src/oaklib/io/streaming_csv_writer.py @@ -1,5 +1,6 @@ import csv import logging +from copy import copy from dataclasses import dataclass, field from typing import Any, Dict, List, Mapping, Type, Union @@ -47,7 +48,7 @@ def emit(self, obj: Union[YAMLRoot, Dict, CURIE], label_fields=None): elif isinstance(obj, CURIE): obj_as_dict = self._get_dict(obj) else: - obj_as_dict = vars(obj) + obj_as_dict = copy(vars(obj)) self._rewrite_dict(obj_as_dict, obj) obj_as_dict = self.add_labels(obj_as_dict, label_fields) heterogeneous_keys = self.heterogeneous_keys or self.pivot_fields @@ -151,6 +152,7 @@ def _rewrite_dict(self, obj_as_dict: dict, original: Any): :param original: :return: """ + # TODO: do this more generically using a transformer if isinstance(original, obograph.LogicalDefinitionAxiom): restrictions = original.restrictions obj_as_dict["genusIds"] = "|".join(original.genusIds) @@ -160,6 +162,26 @@ def _rewrite_dict(self, obj_as_dict: dict, original: Any): [f"{r.propertyId}={r.fillerId}" for r in original.restrictions] ) del obj_as_dict["meta"] + if isinstance(original, obograph.DisjointClassExpressionsAxiom): + # obj_as_dict["classIds"] = original.classIds + obj_as_dict["classExpressionPropertyIds"] = [ + r.propertyId for r in original.classExpressions + ] + obj_as_dict["classExpressionFillerIds"] = [ + r.fillerId for r in original.classExpressions + ] + del obj_as_dict["classExpressions"] + # if original.unionEquivalentTo: + # 
obj_as_dict["unionEquivalentTo"] = original.unionEquivalentTo + if original.unionEquivalentToExpression: + obj_as_dict[ + "unionEquivalentToFillerId" + ] = original.unionEquivalentToExpression.fillerId + obj_as_dict[ + "unionEquivalentToPropertyId" + ] = original.unionEquivalentToExpression.propertyId + del obj_as_dict["unionEquivalentToExpression"] + del obj_as_dict["meta"] if isinstance(original, summary_stats.UngroupedStatistics): for slot in [ "edge_count_by_predicate", diff --git a/src/oaklib/io/streaming_obo_writer.py b/src/oaklib/io/streaming_obo_writer.py index 52f0be59b..e0dd28084 100644 --- a/src/oaklib/io/streaming_obo_writer.py +++ b/src/oaklib/io/streaming_obo_writer.py @@ -7,7 +7,11 @@ from oaklib.converters.obo_graph_to_obo_format_converter import ( OboGraphToOboFormatConverter, ) -from oaklib.datamodels.obograph import GraphDocument, LogicalDefinitionAxiom +from oaklib.datamodels.obograph import ( + DisjointClassExpressionsAxiom, + GraphDocument, + LogicalDefinitionAxiom, +) from oaklib.datamodels.vocabulary import IS_A, RDF_TYPE, SYNONYM_PRED_TO_SCOPE_MAP from oaklib.interfaces.metadata_interface import MetadataInterface from oaklib.interfaces.obograph_interface import OboGraphInterface @@ -97,3 +101,19 @@ def emit_obj(self, obj: Any, **kwargs): for r in obj.restrictions: self.line(f"intersection_of: {r.propertyId} {r.fillerId} ! {oi.label(r.fillerId)}") self.line("\n") + elif isinstance(obj, DisjointClassExpressionsAxiom): + if len(obj.classIds) > 1: + for i1, c1 in enumerate(obj.classIds): + for i2, c2 in enumerate(obj.classIds): + if i1 >= i2: + continue + self.line("[Term]") + self.line(f"id: {c1} ! {oi.label(c1)}") + self.line(f"disjoint_from: {c2} ! 
{oi.label(c2)}") + self.line("\n") + else: + logging.warning( + f"Skipping DisjointClassExpressionsAxiom with only one class: {obj}" + ) + else: + raise NotImplementedError(f"Cannot emit type {type(obj)} {obj}") diff --git a/src/oaklib/utilities/axioms/disjointness_axiom_analyzer.py b/src/oaklib/utilities/axioms/disjointness_axiom_analyzer.py new file mode 100644 index 000000000..0fdc0e777 --- /dev/null +++ b/src/oaklib/utilities/axioms/disjointness_axiom_analyzer.py @@ -0,0 +1,228 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Iterator, List, Optional, Tuple + +from oaklib.datamodels.obograph import ( + DisjointClassExpressionsAxiom, + ExistentialRestrictionExpression, +) +from oaklib.datamodels.vocabulary import IS_A +from oaklib.interfaces import OboGraphInterface +from oaklib.types import CURIE, PRED_CURIE + +CACHE = Dict[PRED_CURIE, Dict[CURIE, List[CURIE]]] + + +@dataclass +class DisjointnessInducerConfig: + min_descendants: int = field(default=3) + exclude_existing: bool = field(default=True) + exclude_if_subsumed: bool = field(default=True) + genus_terms_are_candidates: bool = field(default=True) + + +def _descendants( + adapter: OboGraphInterface, + c: CURIE, + predicates: Optional[List[PRED_CURIE]] = None, + cache: Optional[CACHE] = None, +) -> List[CURIE]: + """ + Return the descendants of a class. + + :param adapter: + :param c: + :param cache: + :return: + """ + if cache is None: + cache = {} + preds = tuple(predicates or [IS_A]) + if preds not in cache: + cache[preds] = {} + pcache = cache[preds] + if c not in pcache: + pcache[c] = list(adapter.descendants(c, predicates)) + return pcache[c] + + +def underlap( + adapter: OboGraphInterface, + class1: CURIE, + class2: CURIE, + predicates: Optional[List[PRED_CURIE]] = None, + cache: Optional[CACHE] = None, +) -> Tuple[int, int, int]: + """ + Compute the underlap between two classes. 
+ + :param adapter: + :param class1: + :param class2: + :param predicates: + :param cache: + :return: tuple of underlap, |C1|, |C2| + """ + if predicates is None: + predicates = [IS_A] + desc1 = _descendants(adapter, class1, predicates=predicates, cache=cache) + desc2 = _descendants(adapter, class2, predicates=predicates, cache=cache) + return len(set(desc1).intersection(set(desc2))), len(desc1), len(desc2) + + +def generate_underlaps( + adapter: OboGraphInterface, + roots: Optional[List[CURIE]] = None, + predicate_sets: Optional[List[List[PRED_CURIE]]] = None, + config: Optional[DisjointnessInducerConfig] = None, +) -> Iterator[Tuple[List[PRED_CURIE], CURIE, CURIE, int, int]]: + """ + Generate disjointness axioms for a set of roots. + + :param adapter: + :param roots: + :return: + """ + cache = {} + if config is None: + config = DisjointnessInducerConfig() + if predicate_sets is None: + predicate_sets = [[IS_A]] + if not roots: + roots = [] + for predicates in predicate_sets: + roots.extend(adapter.roots(predicates)) + stack = list(set(roots)) + logging.info(f"ROOTS: {stack}") + visited = set() + while len(stack) > 0: + node = stack.pop() + if node in visited: + continue + visited.add(node) + for predicates in predicate_sets: + children = [r[0] for r in adapter.relationships(objects=[node], predicates=predicates)] + children = { + c + for c in children + if len(_descendants(adapter, c, predicates=predicates, cache=cache)) + > config.min_descendants + } + logging.info(f"N: {node} CHILDREN {predicates}: {children}") + for i1, c1 in enumerate(children): + for i2, c2 in enumerate(children): + if i2 >= i1: + continue + u, s1, s2 = underlap(adapter, c1, c2, predicates, cache=cache) + if u > 0: + continue + if s1 < config.min_descendants or s2 < config.min_descendants: + continue + logging.debug(f"UNDERLAP: {c1} {c2} {u} {s1} {s2}") + yield predicates, c1, c2, s1, s2 + stack.append(c1) + if config.genus_terms_are_candidates: + ldas = adapter.logical_definitions() + 
genus_ids = set() + for lda in ldas: + genus_ids.update(lda.genusIds) + for i1, c1 in enumerate(genus_ids): + for i2, c2 in enumerate(genus_ids): + if i2 >= i1: + continue + u, s1, s2 = underlap(adapter, c1, c2, [IS_A]) + if u > 0: + continue + if s1 < config.min_descendants or s2 < config.min_descendants: + continue + yield [IS_A], c1, c2, s1, s2 + + +def equivalent(dxa1: DisjointClassExpressionsAxiom, dxa2: DisjointClassExpressionsAxiom) -> bool: + """ + Determine if two disjointness axioms are equivalent. + + :param dxa1: + :param dxa2: + :return: + """ + exprs1 = {str(x) for x in dxa1.classExpressions} + exprs2 = {str(x) for x in dxa2.classExpressions} + return set(dxa1.classIds) == set(dxa2.classIds) and exprs1 == exprs2 + + +def subsumed_by( + adapter: OboGraphInterface, + dxa: DisjointClassExpressionsAxiom, + existing: List[DisjointClassExpressionsAxiom], +) -> Iterator[DisjointClassExpressionsAxiom]: + for other_dxa in existing: + this_class_ids = set(other_dxa.classIds) + this_class_expressions = set( + [(x.propertyId, x.fillerId) for x in other_dxa.classExpressions] + ) + if equivalent(dxa, other_dxa): + yield other_dxa + continue + for c in dxa.classIds: + ancs = list(adapter.ancestors(c, [IS_A])) + this_class_ids = this_class_ids.difference(ancs) + for cx in dxa.classExpressions: + ancs = list(adapter.ancestors(cx.fillerId, [IS_A, cx.propertyId])) + ancxs = [(cx.propertyId, a) for a in ancs] + this_class_expressions = this_class_expressions.difference(ancxs) + if len(this_class_ids) == 0 and len(this_class_expressions) == 0: + yield other_dxa + + +def generate_disjoint_class_expressions_axioms( + adapter: OboGraphInterface, + roots: Optional[List[CURIE]] = None, + predicate_sets: Optional[List[List[PRED_CURIE]]] = None, + config: Optional[DisjointnessInducerConfig] = None, +) -> Iterator[DisjointClassExpressionsAxiom]: + """ + Generate disjointness axioms for a set of roots. 
+ + :param adapter: + :param roots: + :return: + """ + if config is None: + config = DisjointnessInducerConfig() + if config.exclude_existing: + existing = list(adapter.disjoint_class_expressions_axioms()) + else: + existing = [] + for predicates, c1, c2, _s1, _s2 in generate_underlaps(adapter, roots, predicate_sets, config): + non_is_a = [p for p in predicates if p != IS_A] + if not non_is_a: + yield DisjointClassExpressionsAxiom( + classIds=[c1, c2], + ) + else: + if len(non_is_a) > 1: + raise NotImplementedError(f"Cannot handle multiple non-is_a predicates: {non_is_a}") + p = non_is_a[0] + dxa = DisjointClassExpressionsAxiom( + classExpressions=[ + ExistentialRestrictionExpression( + propertyId=p, + fillerId=c1, + ), + ExistentialRestrictionExpression( + propertyId=p, + fillerId=c2, + ), + ] + ) + logging.debug(f"Checking candidate: {dxa} against existing: {len(existing)}") + for e in existing: + if equivalent(e, dxa): + break + subsumers = list(subsumed_by(adapter, dxa, existing)) + if len(subsumers) > 0: + break + else: + yield dxa + existing.append(dxa) diff --git a/tests/test_utilities/test_disjointness_analysis.py b/tests/test_utilities/test_disjointness_analysis.py new file mode 100644 index 000000000..f882b46be --- /dev/null +++ b/tests/test_utilities/test_disjointness_analysis.py @@ -0,0 +1,94 @@ +import unittest + +from oaklib import get_adapter +from oaklib.datamodels.obograph import ( + DisjointClassExpressionsAxiom, + ExistentialRestrictionExpression, +) +from oaklib.datamodels.vocabulary import PART_OF +from oaklib.utilities.axioms.disjointness_axiom_analyzer import ( + generate_disjoint_class_expressions_axioms, + subsumed_by, +) +from tests import ( + FUNGI, + INPUT_DIR, + MEMBRANE, + NUCLEAR_MEMBRANE, + NUCLEUS, + ORGANELLE, + VACUOLE, +) + + +class TestDisjointnessAnalyzer(unittest.TestCase): + def setUp(self) -> None: + self.adapter = get_adapter(INPUT_DIR / "go-nucleus.db") + + def test_generate_axioms(self): + found = False + for ax in 
generate_disjoint_class_expressions_axioms(self.adapter): + if len(ax.classIds) > 1: + assert ax.classIds[0] != ax.classIds[1] + if set(ax.classIds) == {ORGANELLE, MEMBRANE}: + found = True + assert found + + def test_subsumed_by(self): + cases = [ + ((ORGANELLE, MEMBRANE), [], 0, "edge case, no existing axioms"), + ((ORGANELLE, MEMBRANE), [(ORGANELLE, MEMBRANE)], 1, "simple case, identical axioms"), + ( + (ORGANELLE, MEMBRANE), + [(MEMBRANE, ORGANELLE)], + 1, + "simple case, identical axioms, different structure", + ), + ((ORGANELLE, MEMBRANE), [(ORGANELLE, FUNGI)], 0, "simple case, different axioms"), + ((ORGANELLE, NUCLEAR_MEMBRANE), [(ORGANELLE, MEMBRANE)], 1, "subsumption on one side"), + ((NUCLEUS, NUCLEAR_MEMBRANE), [(ORGANELLE, MEMBRANE)], 1, "subsumption on both sides"), + ((ORGANELLE, MEMBRANE), [(NUCLEUS, NUCLEAR_MEMBRANE)], 0, "inverted"), + ] + for asserted_pair, existing_pairs, expected, _info in cases: + asserted_ax = DisjointClassExpressionsAxiom(classIds=list(asserted_pair)) + existing_axioms = [ + DisjointClassExpressionsAxiom(classIds=list(x)) for x in existing_pairs + ] + subsumers = list(subsumed_by(self.adapter, asserted_ax, existing_axioms)) + assert len(subsumers) == expected + + def test_subsumed_by_expr(self): + cases = [ + ((ORGANELLE, MEMBRANE), [], 0, "edge case, no existing axioms"), + ((ORGANELLE, MEMBRANE), [(ORGANELLE, MEMBRANE)], 1, "simple case, identical axioms"), + ( + (ORGANELLE, MEMBRANE), + [(MEMBRANE, ORGANELLE)], + 1, + "simple case, identical axioms, different structure", + ), + ((ORGANELLE, MEMBRANE), [(ORGANELLE, FUNGI)], 0, "simple case, different axioms"), + ((ORGANELLE, NUCLEAR_MEMBRANE), [(ORGANELLE, MEMBRANE)], 1, "subsumption on one side"), + ( + (NUCLEAR_MEMBRANE, VACUOLE), + [(NUCLEUS, VACUOLE)], + 1, + "part subsumption on both sides", + ), + ((NUCLEUS, VACUOLE), [(NUCLEAR_MEMBRANE, VACUOLE)], 0, "inverted"), + ] + + def _as_expr(c: str) -> ExistentialRestrictionExpression: + return 
ExistentialRestrictionExpression(propertyId=PART_OF, fillerId=c) + + for asserted_pair, existing_pairs, expected, info in cases: + asserted_ax = DisjointClassExpressionsAxiom( + classExpressions=[_as_expr(x) for x in asserted_pair] + ) + existing_axioms = [] + for pair in existing_pairs: + existing_axioms.append( + DisjointClassExpressionsAxiom(classExpressions=[_as_expr(x) for x in pair]) + ) + subsumers = list(subsumed_by(self.adapter, asserted_ax, existing_axioms)) + self.assertEqual(expected, len(subsumers), info)