Commit

Merge pull request #9 from Cellular-Semantics/obask-refactoring
Obask refactoring
ubyndr authored May 21, 2024
2 parents 88deae5 + e2e6f65 commit 39c725f
Showing 65 changed files with 1,710 additions and 39 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
@@ -54,7 +54,7 @@ jobs:
- name: Build and push Docker image
uses: docker/[email protected]
with:
context: .
context: "./anndata2rdf/"
push: true
platforms: linux/amd64, linux/arm64
tags: ${{ steps.meta.outputs.tags }}
39 changes: 39 additions & 0 deletions .github/workflows/schema_validator.yaml
@@ -0,0 +1,39 @@
name: YAML schema validator
on:
# Triggers the workflow on pull request events but only for the main branch
pull_request:
branches: [ main ]
paths:
- 'cl_kb_pipeline/config/dumps/neo4j2owl-config.yaml'
permissions:
pull-requests: write

jobs:
yaml-schema-validation:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: 3.8
- name: Install dependencies
run: pip install ruamel.yaml==0.17.21 jsonschema==4.4.0
- name: Schema validation
id: schema
run: |
python cl_kb_pipeline/src/test_neo2owl_config.py
- name: Prepare schema validator comment
if: failure()
run: |
echo "cl_kb_pipeline/config/dumps/neo4j2owl-config.yaml file failed the schema validation check " > comment.md; cat validation.report >> comment.md
- name: Prepare success comment
run: |
echo "cl_kb_pipeline/config/dumps/neo4j2owl-config.yaml file passed validation check " > comment.md
- name: Post validator comment
if: always()
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
uses: NejcZdovc/[email protected]
with:
file: "../../comment.md"
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,6 @@
# Byte-compiled / optimized / DLL files
__pycache__/
src/__pycache__/
*.py[cod]
*$py.class

@@ -158,3 +159,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
**/.DS_Store
19 changes: 0 additions & 19 deletions Dockerfile

This file was deleted.

21 changes: 21 additions & 0 deletions anndata2rdf/Dockerfile
@@ -0,0 +1,21 @@
FROM python:3.10

SHELL ["/bin/bash", "-c"]

RUN apt-get update && apt-get install -y build-essential graphviz libgraphviz-dev pkg-config libhdf5-dev && apt-get clean && rm \
-rf /var/lib/apt/lists/*

WORKDIR /app

COPY requirements.txt ./
RUN pip3 install --upgrade pip
RUN pip3 install -r requirements.txt

RUN mkdir -p src/config src/curated_data src/dataset src/graph

COPY src/csv_parser.py ./src
COPY src/pull_anndata.py ./src
COPY src/generate_rdf.py ./src
COPY src/process.py ./src

CMD ["python", "src/process.py"]
6 changes: 4 additions & 2 deletions docker-compose.yml → anndata2rdf/docker-compose.yml
@@ -2,10 +2,12 @@ version: '3.8'

services:
anndata2rdf:
image: anndata2rdf:latest
image: ghcr.io/obophenotype/cl_kb:main@sha256:26384c8de0416e3d9888407d73472d12709d0924cd509081f74b9b5e17e634cf
container_name: anndata_to_rdf
volumes:
- ./src/config:/app/src/config
- ./src/curated_data:/app/src/curated_data
- ./src/dataset:/app/src/dataset
- ./src/graph:/app/src/graph
- obask_data:/app/src/graph
volumes:
obask_data:
3 changes: 3 additions & 0 deletions anndata2rdf/requirements.txt
@@ -0,0 +1,3 @@
pandasaurus-cxg
pandas
PyYAML~=6.0.1
3 changes: 3 additions & 0 deletions anndata2rdf/src/config/README.md
@@ -0,0 +1,3 @@
### `config`
Contains YAML files generated from the CSV files in the `curated_data` directory. These configurations guide the download of datasets from CZ CELLxGENE (CxG).
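Judging from how pull_anndata.py consumes these files, each entry carries a CxG_link and an author_cell_type_list. A minimal reading sketch (the file name is taken from generate_rdf.py; values are illustrative):

import yaml

# Reads a generated config such as src/config/cxg_author_cell_type.yaml.
with open("src/config/cxg_author_cell_type.yaml") as f:
    config_list = yaml.safe_load(f)

for entry in config_list:
    # Keys as accessed by get_dataset_dict() in pull_anndata.py.
    print(entry["CxG_link"], entry["author_cell_type_list"])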
3 changes: 2 additions & 1 deletion src/csv_parser.py → anndata2rdf/src/csv_parser.py
@@ -12,7 +12,8 @@


def generate_yaml_data(data):
grouped_data = data.groupby("CxG link")
filtered_df = data[data["Content"] == "cell types"]
grouped_data = filtered_df.groupby("h5ad link")
_yaml_data = []
for link, group_df in grouped_data:
author_cell_type_list = [
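The hunk above narrows generate_yaml_data: rows are first filtered to Content == "cell types", and grouping now keys on the h5ad link column instead of the CxG link. The same pandas pattern in isolation (column values are illustrative):

import pandas as pd

# Illustrative rows; the real input is read from the CSVs in curated_data.
data = pd.DataFrame(
    {
        "h5ad link": ["https://example.org/a.h5ad", "https://example.org/a.h5ad"],
        "Content": ["cell types", "tissues"],
        "author cell type": ["T cell", "lung epithelial cell"],
    }
)

filtered_df = data[data["Content"] == "cell types"]  # keep only cell-type rows
for link, group_df in filtered_df.groupby("h5ad link"):
    print(link, group_df["author cell type"].tolist())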

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions anndata2rdf/src/curated_data/README.md
@@ -0,0 +1,2 @@
### `curated_data`
This directory holds the original CSV files that are used as the starting point of the data processing pipeline. These files are read and processed to generate corresponding YAML configurations.
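A condensed sketch of this CSV-to-YAML hand-off, using generate_yaml_data and write_yaml_file from csv_parser.py as they appear elsewhere in this diff (file paths are illustrative):

import pandas as pd

from csv_parser import generate_yaml_data, write_yaml_file

# Hypothetical input file; any curated CSV with "Content" and "h5ad link" columns.
data = pd.read_csv("curated_data/example_curation.csv")
yaml_data = generate_yaml_data(data)
write_yaml_file(yaml_data, "config/cxg_author_cell_type.yaml")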
2 changes: 2 additions & 0 deletions anndata2rdf/src/dataset/README.md
@@ -0,0 +1,2 @@
### `dataset`
Stores the datasets downloaded according to the instructions specified in the YAML files located in the `config` directory. These datasets are then used for further processing and analysis.
2 changes: 1 addition & 1 deletion src/generate_rdf.py → anndata2rdf/src/generate_rdf.py
@@ -32,7 +32,7 @@ def generate_rdf_graph(
with open(
os.path.join(
config_dir,
"rdf_config.yaml",
"cxg_author_cell_type.yaml",
),
"r",
) as file:
2 changes: 2 additions & 0 deletions anndata2rdf/src/graph/README.md
@@ -0,0 +1,2 @@
### `graph`
Contains the OWL (Web Ontology Language) files that are generated from the datasets in the `dataset` directory. These files represent the structured data in a format that is suitable for semantic web applications.
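These OWL files come out of generate_rdf_graph; a minimal usage sketch mirroring the call in process.py (all argument values here are made up):

from generate_rdf import generate_rdf_graph

# Arguments mirror src/process.py: AnnData path, author cell types, output stem.
generate_rdf_graph(
    "dataset/example_dataset.h5ad",  # hypothetical downloaded AnnData file
    ["T cell", "B cell"],            # hypothetical author_cell_type_list entries
    "graph/example_dataset",         # output location inside src/graph
)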
14 changes: 9 additions & 5 deletions src/process.py → anndata2rdf/src/process.py
@@ -3,14 +3,19 @@
import sys

from csv_parser import generate_author_cell_type_config, write_yaml_file
from pull_anndata import download_dataset_with_id, get_dataset_dict, delete_file
from pull_anndata import (
get_dataset_dict,
delete_file,
download_dataset_with_url,
get_dataset_id_from_h5ad_link,
)
from generate_rdf import generate_rdf_graph

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)
stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
stdout_handler.setFormatter(formatter)
logger.addHandler(stdout_handler)

@@ -28,16 +33,15 @@
CXG_AUTHOR_CELL_TYPE_CONFIG,
)
write_yaml_file(cxg_author_cell_type_yaml, output_file_path)

datasets = get_dataset_dict(cxg_author_cell_type_yaml)
for dataset, author_cell_types in datasets.items():
dataset_path = download_dataset_with_id(dataset)
dataset_path = download_dataset_with_url(dataset)
generate_rdf_graph(
dataset_path,
author_cell_types,
os.path.join(
os.path.join(os.path.dirname(os.path.abspath(__file__)), GRAPH_DIRECTORY),
dataset,
get_dataset_id_from_h5ad_link(dataset),
),
)
delete_file(dataset_path)
61 changes: 55 additions & 6 deletions src/pull_anndata.py → anndata2rdf/src/pull_anndata.py
@@ -1,9 +1,9 @@
import logging
import os
from typing import Dict, List, Optional, Union
import yaml

import cellxgene_census
import requests
import yaml


logging.basicConfig(level=logging.WARNING)
@@ -32,12 +32,56 @@ def download_dataset_with_id(dataset_id: str, file_path: Optional[str] = None) -
if os.path.exists(anndata_file_path):
logger.info(f"File '{anndata_file_path}' already exists. Skipping download.")
else:
logger.info(f"Downloading dataset with ID '{dataset_id} to {anndata_file_path}'...")
logger.info(
f"Downloading dataset with ID '{dataset_id} to {anndata_file_path}'..."
)
cellxgene_census.download_source_h5ad(dataset_id, to_path=anndata_file_path)
logger.info(f"Download complete. File saved at '{anndata_file_path}'.")
return anndata_file_path


def download_dataset_with_url(dataset_url: str, file_path: Optional[str] = None) -> str:
"""
Download an AnnData dataset from the specified URL.
Args:
dataset_url (str): The URL of the dataset to download.
file_path (Optional[str], optional): The file path to save the downloaded AnnData. If not provided,
the dataset ID derived from the URL will be used as the file name. Defaults to None.
Returns:
str: The path to the downloaded file.
"""

anndata_file_path = (
f"{get_dataset_id_from_h5ad_link(dataset_url)}.h5ad"
if file_path is None
else file_path
)
anndata_file_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
os.path.join("dataset", anndata_file_path),
)
if os.path.exists(anndata_file_path):
logger.info(f"File '{anndata_file_path}' already exists. Skipping download.")
else:
logger.info(
f"Downloading dataset with URL '{dataset_url} to {anndata_file_path}'..."
)
response = requests.get(dataset_url)
if response.status_code == 200:
with open(anndata_file_path, "wb") as f:
f.write(response.content)
logger.info(f"Download complete. File saved at '{anndata_file_path}'.")
else:
logger.info(f"Failed to download the dataset with URL '{dataset_url}'...")
return anndata_file_path


def get_dataset_id_from_h5ad_link(dataset_url):
return dataset_url.split("/")[-1].split(".")[0]


def delete_file(file_name):
try:
os.remove(file_name)
@@ -50,8 +94,13 @@ def get_dataset_dict(input_source: List[Dict]):
cxg_dataset_dict = {}
for config in input_source:
cxg_link = config["CxG_link"]
cxg_id = get_dataset_id_from_link(cxg_link)
cxg_dataset_dict.update({cxg_id.split(".")[0]: config["author_cell_type_list"]})
if cxg_link.endswith(".cxg"):
cxg_id = get_dataset_id_from_link(cxg_link)
cxg_dataset_dict.update(
{cxg_id.split(".")[0]: config["author_cell_type_list"]}
)
else:
cxg_dataset_dict.update({cxg_link: config["author_cell_type_list"]})
return cxg_dataset_dict


@@ -76,4 +125,4 @@ def read_yaml_config(config_file: str):
)
datasets = get_dataset_dict(config_list)
for dataset in datasets.keys():
dataset_name = download_dataset_with_id(dataset)
dataset_name = download_dataset_with_url(dataset)
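Taken together, the new helpers key datasets by their raw .h5ad URL and download over HTTP rather than through cellxgene_census. A quick illustration (the URL is hypothetical):

from pull_anndata import download_dataset_with_url, get_dataset_id_from_h5ad_link

url = "https://datasets.cellxgene.cziscience.com/0895c838.h5ad"  # hypothetical URL
print(get_dataset_id_from_h5ad_link(url))  # -> "0895c838"

# Saves to src/dataset/0895c838.h5ad unless file_path is given.
local_path = download_dataset_with_url(url)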
5 changes: 5 additions & 0 deletions cl_kb_pipeline/README.md
@@ -0,0 +1,5 @@
# cl_kb_pipeline

[OBASK pipeline](https://github.com/OBASKTools/obask) for cl_kb_pipeline.

To run the pipeline, please follow the `Run your project` steps of the [OBASK quick start guide](https://obasktools.github.io/obask/quick_start/).
2 changes: 2 additions & 0 deletions cl_kb_pipeline/config/collectdata/config.env
@@ -0,0 +1,2 @@
EXPORT_KB_TO_OWL=false
COLLECT_BIBLIO_DATA=false
@@ -0,0 +1,18 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

DELETE {
?s <http://n2o.neo/custom/block> ?blocked .
?s ?p ?o .
}
WHERE {
?s <http://n2o.neo/custom/block> ?blocked .
?s ?p ?o .
FILTER(?blocked=true) .
FILTER(isIRI(?s))
}

### EDIT: this was obsoleted in the end in favour of a cypher solution, see process.sh.
@@ -0,0 +1,30 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


DELETE {
?s ?p ?o .
?r rdf:type owl:Axiom ;
owl:annotatedSource ?s ;
owl:annotatedProperty ?p ;
owl:annotatedTarget ?o ;
<http://n2o.neo/custom/block> ?blocked;
?bp ?bo;

} WHERE {
?s ?p ?o .
?r rdf:type owl:Axiom ;
owl:annotatedSource ?s ;
owl:annotatedProperty ?p ;
owl:annotatedTarget ?o ;
<http://n2o.neo/custom/block> ?blocked;
?bp ?bo;

FILTER(?blocked=true) .
}

### EDIT: this was obsoleted in the end in favour of a cypher solution, see process.sh.
@@ -0,0 +1,31 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

#Delete all ds:DataSet where ds.production is False
#Delete all i:Individual where (ds)-[:has_source]-(i:Individual)<-[:depicts]-(ch:Individual) WHERE ds.production is False

DELETE {
?channel ?channelrel ?channelval .
}

WHERE {

?dataset n2o:nodeLabel ?nodelabel . # This selects all datasets

OPTIONAL {
?dataset n2oc:production ?production .
# n2oc:production is a bit brittle because IRI might be changed (risk!)
}

?image dct:source ?dataset .
?channel <http://xmlns.com/foaf/0.1/depicts> ?image . # There does not always seem to be a channel
?channel ?channelrel ?channelval .

FILTER(?production=false || !bound(?production)) .
FILTER(?nodelabel="DataSet")
}

### EDIT: this was obsoleted in the end in favour of a ROBOT solution, see process.sh. Using SPARQL this way consumes too much memory.