Commit

Merge pull request #9 from Cellular-Semantics/obask-refactoring
Obask refactoring
ubyndr authored May 21, 2024
2 parents 88deae5 + e2e6f65 commit 39c725f
Showing 65 changed files with 1,710 additions and 39 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
@@ -54,7 +54,7 @@ jobs:
- name: Build and push Docker image
uses: docker/[email protected]
with:
context: .
context: "./anndata2rdf/"
push: true
platforms: linux/amd64, linux/arm64
tags: ${{ steps.meta.outputs.tags }}
39 changes: 39 additions & 0 deletions .github/workflows/schema_validator.yaml
@@ -0,0 +1,39 @@
name: YAML schema validator
on:
# Triggers the workflow on pull request events but only for the main branch
pull_request:
branches: [ main ]
paths:
- 'cl_kb_pipeline/config/dumps/neo4j2owl-config.yaml'
permissions:
pull-requests: write

jobs:
yaml-schema-validation:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: 3.8
- name: Install dependencies
run: pip install ruamel.yaml==0.17.21 jsonschema==4.4.0
- name: Schema validation
id: schema
run: |
python cl_kb_pipeline/src/test_neo2owl_config.py
- name: Prepare schema validator comment
if: failure()
run: |
echo "cl_kb_pipeline/config/dumps/neo4j2owl-config.yaml file failed the schema validation check " > comment.md; cat validation.report >> comment.md
- name: Prepare success comment
run: |
echo "cl_kb_pipeline/config/dumps/neo4j2owl-config.yaml file passed validation check " > comment.md
- name: Post validator comment
if: always()
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
uses: NejcZdovc/[email protected]
with:
file: "../../comment.md"
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,6 @@
# Byte-compiled / optimized / DLL files
__pycache__/
src/__pycache__/
*.py[cod]
*$py.class

@@ -158,3 +159,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
**/.DS_Store
19 changes: 0 additions & 19 deletions Dockerfile

This file was deleted.

21 changes: 21 additions & 0 deletions anndata2rdf/Dockerfile
@@ -0,0 +1,21 @@
FROM python:3.10

SHELL ["/bin/bash", "-c"]

RUN apt-get update && apt-get install -y build-essential graphviz libgraphviz-dev pkg-config libhdf5-dev && apt-get clean && rm \
-rf /var/lib/apt/lists/*

WORKDIR /app

COPY requirements.txt ./
RUN pip3 install --upgrade pip
RUN pip3 install -r requirements.txt

RUN mkdir -p src/config src/curated_data src/dataset src/graph

COPY src/csv_parser.py ./src
COPY src/pull_anndata.py ./src
COPY src/generate_rdf.py ./src
COPY src/process.py ./src

CMD ["python", "src/process.py"]
6 changes: 4 additions & 2 deletions docker-compose.yml → anndata2rdf/docker-compose.yml
@@ -2,10 +2,12 @@ version: '3.8'

services:
anndata2rdf:
image: anndata2rdf:latest
image: ghcr.io/obophenotype/cl_kb:main@sha256:26384c8de0416e3d9888407d73472d12709d0924cd509081f74b9b5e17e634cf
container_name: anndata_to_rdf
volumes:
- ./src/config:/app/src/config
- ./src/curated_data:/app/src/curated_data
- ./src/dataset:/app/src/dataset
- ./src/graph:/app/src/graph
- obask_data:/app/src/graph
volumes:
obask_data:
3 changes: 3 additions & 0 deletions anndata2rdf/requirements.txt
@@ -0,0 +1,3 @@
pandasaurus-cxg
pandas
PyYAML~=6.0.1
3 changes: 3 additions & 0 deletions anndata2rdf/src/config/README.md
@@ -0,0 +1,3 @@
### `config`
Contains YAML files generated from the CSV files in the `curated_data` directory. These configurations guide the download of datasets from CZ CELLxGENE (CxG).
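Judging from how pull_anndata.py consumes these files, each entry carries a CxG_link and an author_cell_type_list. A minimal reading sketch (the file name is taken from generate_rdf.py; values are illustrative):

import yaml

# Reads a generated config such as src/config/cxg_author_cell_type.yaml.
with open("src/config/cxg_author_cell_type.yaml") as f:
    config_list = yaml.safe_load(f)

for entry in config_list:
    # Keys as accessed by get_dataset_dict() in pull_anndata.py.
    print(entry["CxG_link"], entry["author_cell_type_list"])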
3 changes: 2 additions & 1 deletion src/csv_parser.py → anndata2rdf/src/csv_parser.py
@@ -12,7 +12,8 @@


def generate_yaml_data(data):
grouped_data = data.groupby("CxG link")
filtered_df = data[data["Content"] == "cell types"]
grouped_data = filtered_df.groupby("h5ad link")
_yaml_data = []
for link, group_df in grouped_data:
author_cell_type_list = [
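The hunk above narrows generate_yaml_data: rows are first filtered to Content == "cell types", and grouping now keys on the h5ad link column instead of the CxG link. The same pandas pattern in isolation (column values are illustrative):

import pandas as pd

# Illustrative rows; the real input is read from the CSVs in curated_data.
data = pd.DataFrame(
    {
        "h5ad link": ["https://example.org/a.h5ad", "https://example.org/a.h5ad"],
        "Content": ["cell types", "tissues"],
        "author cell type": ["T cell", "lung epithelial cell"],
    }
)

filtered_df = data[data["Content"] == "cell types"]  # keep only cell-type rows
for link, group_df in filtered_df.groupby("h5ad link"):
    print(link, group_df["author cell type"].tolist())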

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions anndata2rdf/src/curated_data/README.md
@@ -0,0 +1,2 @@
### `curated_data`
This directory holds the original CSV files that are used as the starting point of the data processing pipeline. These files are read and processed to generate corresponding YAML configurations.
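A condensed sketch of this CSV-to-YAML hand-off, using generate_yaml_data and write_yaml_file from csv_parser.py as they appear elsewhere in this diff (file paths are illustrative):

import pandas as pd

from csv_parser import generate_yaml_data, write_yaml_file

# Hypothetical input file; any curated CSV with "Content" and "h5ad link" columns.
data = pd.read_csv("curated_data/example_curation.csv")
yaml_data = generate_yaml_data(data)
write_yaml_file(yaml_data, "config/cxg_author_cell_type.yaml")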
2 changes: 2 additions & 0 deletions anndata2rdf/src/dataset/README.md
@@ -0,0 +1,2 @@
### `dataset`
Stores the datasets downloaded according to the instructions specified in the YAML files located in the `config` directory. These datasets are then used for further processing and analysis.
2 changes: 1 addition & 1 deletion src/generate_rdf.py → anndata2rdf/src/generate_rdf.py
@@ -32,7 +32,7 @@ def generate_rdf_graph(
with open(
os.path.join(
config_dir,
"rdf_config.yaml",
"cxg_author_cell_type.yaml",
),
"r",
) as file:
2 changes: 2 additions & 0 deletions anndata2rdf/src/graph/README.md
@@ -0,0 +1,2 @@
### `graph`
Contains the OWL (Web Ontology Language) files that are generated from the datasets in the `dataset` directory. These files represent the structured data in a format that is suitable for semantic web applications.
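These OWL files come out of generate_rdf_graph; a minimal usage sketch mirroring the call in process.py (all argument values here are made up):

from generate_rdf import generate_rdf_graph

# Arguments mirror src/process.py: AnnData path, author cell types, output stem.
generate_rdf_graph(
    "dataset/example_dataset.h5ad",  # hypothetical downloaded AnnData file
    ["T cell", "B cell"],            # hypothetical author_cell_type_list entries
    "graph/example_dataset",         # output location inside src/graph
)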
14 changes: 9 additions & 5 deletions src/process.py → anndata2rdf/src/process.py
@@ -3,14 +3,19 @@
import sys

from csv_parser import generate_author_cell_type_config, write_yaml_file
from pull_anndata import download_dataset_with_id, get_dataset_dict, delete_file
from pull_anndata import (
get_dataset_dict,
delete_file,
download_dataset_with_url,
get_dataset_id_from_h5ad_link,
)
from generate_rdf import generate_rdf_graph

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)
stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
stdout_handler.setFormatter(formatter)
logger.addHandler(stdout_handler)

@@ -28,16 +33,15 @@
CXG_AUTHOR_CELL_TYPE_CONFIG,
)
write_yaml_file(cxg_author_cell_type_yaml, output_file_path)

datasets = get_dataset_dict(cxg_author_cell_type_yaml)
for dataset, author_cell_types in datasets.items():
dataset_path = download_dataset_with_id(dataset)
dataset_path = download_dataset_with_url(dataset)
generate_rdf_graph(
dataset_path,
author_cell_types,
os.path.join(
os.path.join(os.path.dirname(os.path.abspath(__file__)), GRAPH_DIRECTORY),
dataset,
get_dataset_id_from_h5ad_link(dataset),
),
)
delete_file(dataset_path)
61 changes: 55 additions & 6 deletions src/pull_anndata.py → anndata2rdf/src/pull_anndata.py
@@ -1,9 +1,9 @@
import logging
import os
from typing import Dict, List, Optional, Union
import yaml

import cellxgene_census
import requests
import yaml


logging.basicConfig(level=logging.WARNING)
@@ -32,12 +32,56 @@ def download_dataset_with_id(dataset_id: str, file_path: Optional[str] = None) -
if os.path.exists(anndata_file_path):
logger.info(f"File '{anndata_file_path}' already exists. Skipping download.")
else:
logger.info(f"Downloading dataset with ID '{dataset_id} to {anndata_file_path}'...")
logger.info(
f"Downloading dataset with ID '{dataset_id} to {anndata_file_path}'..."
)
cellxgene_census.download_source_h5ad(dataset_id, to_path=anndata_file_path)
logger.info(f"Download complete. File saved at '{anndata_file_path}'.")
return anndata_file_path


def download_dataset_with_url(dataset_url: str, file_path: Optional[str] = None) -> str:
"""
Download an AnnData dataset from the specified URL.
Args:
dataset_url (str): The URL of the dataset to download.
file_path (Optional[str], optional): The file path to save the downloaded AnnData. If not provided,
the dataset ID derived from the URL will be used as the file name. Defaults to None.
Returns:
str: The path to the downloaded file.
"""

anndata_file_path = (
f"{get_dataset_id_from_h5ad_link(dataset_url)}.h5ad"
if file_path is None
else file_path
)
anndata_file_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
os.path.join("dataset", anndata_file_path),
)
if os.path.exists(anndata_file_path):
logger.info(f"File '{anndata_file_path}' already exists. Skipping download.")
else:
logger.info(
f"Downloading dataset with URL '{dataset_url} to {anndata_file_path}'..."
)
response = requests.get(dataset_url)
if response.status_code == 200:
with open(anndata_file_path, "wb") as f:
f.write(response.content)
logger.info(f"Download complete. File saved at '{anndata_file_path}'.")
else:
logger.info(f"Failed to download the dataset with URL '{dataset_url}'...")
return anndata_file_path


def get_dataset_id_from_h5ad_link(dataset_url):
return dataset_url.split("/")[-1].split(".")[0]


def delete_file(file_name):
try:
os.remove(file_name)
@@ -50,8 +94,13 @@ def get_dataset_dict(input_source: List[Dict]):
cxg_dataset_dict = {}
for config in input_source:
cxg_link = config["CxG_link"]
cxg_id = get_dataset_id_from_link(cxg_link)
cxg_dataset_dict.update({cxg_id.split(".")[0]: config["author_cell_type_list"]})
if cxg_link.endswith(".cxg"):
cxg_id = get_dataset_id_from_link(cxg_link)
cxg_dataset_dict.update(
{cxg_id.split(".")[0]: config["author_cell_type_list"]}
)
else:
cxg_dataset_dict.update({cxg_link: config["author_cell_type_list"]})
return cxg_dataset_dict


@@ -76,4 +125,4 @@ def read_yaml_config(config_file: str):
)
datasets = get_dataset_dict(config_list)
for dataset in datasets.keys():
dataset_name = download_dataset_with_id(dataset)
dataset_name = download_dataset_with_url(dataset)
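Taken together, the new helpers key datasets by their raw .h5ad URL and download over HTTP rather than through cellxgene_census. A quick illustration (the URL is hypothetical):

from pull_anndata import download_dataset_with_url, get_dataset_id_from_h5ad_link

url = "https://datasets.cellxgene.cziscience.com/0895c838.h5ad"  # hypothetical URL
print(get_dataset_id_from_h5ad_link(url))  # -> "0895c838"

# Saves to src/dataset/0895c838.h5ad unless file_path is given.
local_path = download_dataset_with_url(url)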
5 changes: 5 additions & 0 deletions cl_kb_pipeline/README.md
@@ -0,0 +1,5 @@
# cl_kb_pipeline

[OBASK pipeline](https://github.com/OBASKTools/obask) for cl_kb_pipeline.

To run the pipeline, please follow the `Run your project` steps of the [OBASK quick start guide](https://obasktools.github.io/obask/quick_start/).
2 changes: 2 additions & 0 deletions cl_kb_pipeline/config/collectdata/config.env
@@ -0,0 +1,2 @@
EXPORT_KB_TO_OWL=false
COLLECT_BIBLIO_DATA=false
@@ -0,0 +1,18 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

DELETE {
?s <http://n2o.neo/custom/block> ?blocked .
?s ?p ?o .
}
WHERE {
?s <http://n2o.neo/custom/block> ?blocked .
?s ?p ?o .
FILTER(?blocked=true) .
FILTER(isIRI(?s))
}

### EDIT: this was obsoleted in the end in favour of a cypher solution, see process.sh.
@@ -0,0 +1,30 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


DELETE {
?s ?p ?o .
?r rdf:type owl:Axiom ;
owl:annotatedSource ?s ;
owl:annotatedProperty ?p ;
owl:annotatedTarget ?o ;
<http://n2o.neo/custom/block> ?blocked;
?bp ?bo;

} WHERE {
?s ?p ?o .
?r rdf:type owl:Axiom ;
owl:annotatedSource ?s ;
owl:annotatedProperty ?p ;
owl:annotatedTarget ?o ;
<http://n2o.neo/custom/block> ?blocked;
?bp ?bo;

FILTER(?blocked=true) .
}

### EDIT: this was obsoleted in the end in favour of a cypher solution, see process.sh.
@@ -0,0 +1,31 @@
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX n2o: <http://n2o.neo/property/>
PREFIX n2oc: <http://n2o.neo/custom/>
PREFIX dct: <http://purl.org/dc/terms/>

#Delete all ds:DataSet where ds.production is False
#Delete all i:Individual where (ds)-[:has_source]-(i:Individual)<-[:depicts]-(ch:Individual) WHERE ds.production is False

DELETE {
?channel ?channelrel ?channelval .
}

WHERE {

?dataset n2o:nodeLabel ?nodelabel . # This selects all datasets

OPTIONAL {
?dataset n2oc:production ?production .
# n2oc:production is a bit brittle because IRI might be changed (risk!)
}

?image dct:source ?dataset .
?channel <http://xmlns.com/foaf/0.1/depicts> ?image . # There does not always seem to be a channel
?channel ?channelrel ?channelval .

FILTER(?production=false || !bound(?production)) .
FILTER(?nodelabel="DataSet")
}

### EDIT: this was obsoleted in the end in favour of a ROBOT solution, see process.sh. Using SPARQL this way consumes too much memory.