Skip to content

Commit

Permalink
Merge pull request #6 from obophenotype/test
Browse files Browse the repository at this point in the history
Dockerfile and docker-compose file have been added.
  • Loading branch information
ubyndr authored Apr 3, 2024
2 parents 8640dad + 463ea9f commit 8a1550f
Show file tree
Hide file tree
Showing 8 changed files with 330 additions and 0 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: Docker

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

on:
  workflow_dispatch:
  release:
    types: [created]

env:
  # Use docker.io for Docker Hub if empty
  REGISTRY: ghcr.io
  # github.repository as <account>/<repo>
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    # Least-privilege token: read the repo, push packages (images) only.
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to the Container registry
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      # Exports BRANCH for later steps.
      # NOTE(review): no step below reads it — confirm it is still needed.
      - name: Setup environment
        run: echo "BRANCH=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV

      # v1 of these actions runs on a deprecated Node runtime; v3 is the
      # current maintained major release with the same inputs.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push Docker image
        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
        with:
          context: .
          push: true
          platforms: linux/amd64,linux/arm64
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
19 changes: 19 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Python 3.9 base image. Build tools and the graphviz headers are
# installed so native extensions in the requirements can compile.
FROM python:3.9

# Use bash for RUN lines so bash-only syntax keeps working if added later.
SHELL ["/bin/bash", "-c"]

# Install native build dependencies, then clear apt caches to keep the
# final image small.
RUN apt-get update \
    && apt-get install -y build-essential graphviz libgraphviz-dev pkg-config \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Isolate Python dependencies in a virtual environment and put its bin
# directory first on PATH so plain `python`/`pip` resolve inside it.
ENV VENV="/opt/env"
ENV PATH="$VENV/bin:$PATH"

RUN python -m venv $VENV

# Copy requirements separately from the source tree so Docker layer
# caching skips the slow pip install when only src/ changes.
# --no-cache-dir avoids persisting pip's download cache in the image.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY src/ ./src

CMD ["python", "src/process.py"]
11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Compose service running the anndata-to-RDF pipeline.
# NOTE(review): the top-level `version` key is informational only in
# Compose v2+ and could be dropped; kept for compatibility.
version: '3.8'

services:
  anndata2rdf:
    # Image must be built beforehand (no `build:` section here), e.g.
    # `docker build -t anndata2rdf:latest .`
    image: anndata2rdf:latest
    container_name: anndata_to_rdf
    # Bind-mount configuration, inputs, and outputs so data persists on
    # the host between runs.
    volumes:
      - ./src/config:/app/src/config
      - ./src/curated_data:/app/src/curated_data
      - ./src/dataset:/app/src/dataset
      - ./src/graph:/app/src/graph
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Runtime dependencies for the anndata-to-RDF pipeline (see Dockerfile).
cellxgene-census==1.11.1
pandasaurus-cxg~=0.1.11
pandas~=2.2.1
PyYAML~=6.0.1
60 changes: 60 additions & 0 deletions src/csv_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging
import os

import pandas as pd
import yaml


logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def generate_yaml_data(data):
    """Build YAML-ready config entries from a curated-data table.

    Rows are grouped by their "CxG link" column; each group becomes one
    entry holding the link and the whitespace-stripped author cell type
    field names found in that group.

    Args:
        data: DataFrame with "CxG link" and
            "Author Category Cell Type Field Name" columns.

    Returns:
        list[dict]: entries of the form
            {"CxG_link": <link>, "author_cell_type_list": [<name>, ...]}
    """
    entries = []
    for cxg_link, group in data.groupby("CxG link"):
        cell_types = [
            field.strip()
            for field in group["Author Category Cell Type Field Name"].tolist()
        ]
        entries.append({"CxG_link": cxg_link, "author_cell_type_list": cell_types})
    return entries


def write_yaml_file(yaml_data, file_path):
    """Serialize ``yaml_data`` to ``file_path`` as YAML and log the write."""
    with open(file_path, "w") as yaml_handle:
        yaml.dump(yaml_data, yaml_handle)
    logger.info(f"{file_path} written")


def generate_author_cell_type_config(curated_data_folder: str = "curated_data"):
    """Collect author-cell-type config entries from curated data tables.

    Reads every ``.csv``/``.xlsx``/``.xls`` file in the curated data
    folder and converts each into YAML-ready entries via
    ``generate_yaml_data``. Files with other extensions are skipped with
    an INFO log line.

    Args:
        curated_data_folder: Folder holding the curated tables. A relative
            path is resolved against this script's directory (the original,
            backward-compatible behaviour); an absolute path is used as-is.

    Returns:
        list: Concatenated entries from all recognized files.
    """
    if os.path.isabs(curated_data_folder):
        data_folder = curated_data_folder
    else:
        data_folder = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), curated_data_folder
        )

    all_yaml_data = []
    # Sort for a deterministic entry order regardless of OS listing order.
    for file_name in sorted(os.listdir(data_folder)):
        file_path = os.path.join(data_folder, file_name)

        if file_name.endswith(".csv"):
            df = pd.read_csv(file_path)
        elif file_name.endswith((".xlsx", ".xls")):
            df = pd.read_excel(file_path)
        else:
            logger.info(f"Skipping file '{file_name}' with unsupported format.")
            continue

        all_yaml_data.extend(generate_yaml_data(df))
    return all_yaml_data


if __name__ == "__main__":
    # Standalone entry point: regenerate the author-cell-type config from
    # the curated data and write it to config/cxg_author_cell_type.yaml
    # next to this script.
    config_yaml = generate_author_cell_type_config()
    output_file_path = os.path.join(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "config"),
        "cxg_author_cell_type.yaml",
    )
    write_yaml_file(config_yaml, output_file_path)
53 changes: 53 additions & 0 deletions src/generate_rdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import logging
import os
from typing import List
import yaml

from pandasaurus_cxg.enrichment_analysis import AnndataEnrichmentAnalyzer
from pandasaurus_cxg.graph_generator.graph_generator import GraphGenerator

logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def generate_rdf_graph(
    anndata_file_path: str, author_cell_type_list: List[str], output_rdf_path: str
):
    """Turn an AnnData file into an RDF graph via pandasaurus_cxg.

    Args:
        anndata_file_path: Path to the input ``.h5ad`` file.
        author_cell_type_list: obs column names holding author cell type
            annotations; also used as the label-adding priority order.
        output_rdf_path: Target passed to ``save_rdf_graph`` as
            ``file_name``. NOTE(review): presumably pandasaurus_cxg
            appends the RDF extension itself — confirm against its docs.
    """
    logger.info(f"Generating RDF graph using {anndata_file_path}...")
    # NOTE(review): call order follows the pandasaurus_cxg pipeline as
    # written originally — analysis, graph generation, then labelling;
    # do not reorder without checking that library's API.
    aea = AnndataEnrichmentAnalyzer(anndata_file_path, author_cell_type_list)
    aea.analyzer_manager.co_annotation_report()
    gg = GraphGenerator(aea)
    gg.generate_rdf_graph()
    gg.set_label_adding_priority(author_cell_type_list)
    gg.add_label_to_terms()
    gg.save_rdf_graph(file_name=output_rdf_path)
    logger.info(f"RDF graph has been generated for {anndata_file_path}...")


if __name__ == "__main__":
    # Standalone entry point: build one RDF graph per entry in
    # config/rdf_config.yaml (a list of mappings, each with
    # anndata_file_path, author_cell_type_list, optional output_rdf_path).
    dirname = os.path.dirname(os.path.abspath(__file__))
    config_dir = os.path.join(dirname, "config")
    with open(os.path.join(config_dir, "rdf_config.yaml"), "r") as file:
        config_data = yaml.safe_load(file)

    for config in config_data:
        # Default output name: AnnData file's basename without extension.
        default_name = os.path.splitext(
            os.path.basename(str(config["anndata_file_path"]))
        )[0]
        generate_rdf_graph(
            os.path.join(dirname, str(config["anndata_file_path"])),
            config["author_cell_type_list"],
            # Resolve against the script directory (not the CWD) so graphs
            # land in src/graph regardless of where this is launched —
            # consistent with how config and input paths are resolved.
            os.path.join(dirname, "graph", config.get("output_rdf_path", default_name)),
        )
43 changes: 43 additions & 0 deletions src/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import logging
import os
import sys

from csv_parser import generate_author_cell_type_config, write_yaml_file
from pull_anndata import download_dataset_with_id, get_dataset_dict, delete_file
from generate_rdf import generate_rdf_graph

# Pipeline driver: config generation -> dataset download -> RDF graph
# generation -> cleanup. Runs top-to-bottom on import/execution.

# Root logging at WARNING, with an explicit stdout handler for INFO records.
# NOTE(review): this logger's own level is never lowered, so its effective
# level stays WARNING and INFO records may be filtered before reaching the
# handler — confirm intended.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)
stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stdout_handler.setFormatter(formatter)
logger.addHandler(stdout_handler)

# Directory names, all resolved relative to this script's directory below.
CONFIG_DIRECTORY = "config"
CURATED_DATA_DIRECTORY = "curated_data"
DATASET_DIRECTORY = "dataset"
GRAPH_DIRECTORY = "graph"

CXG_AUTHOR_CELL_TYPE_CONFIG = "cxg_author_cell_type.yaml"
# NOTE(review): GENERATE_RDF_CONFIG is defined but never used below.
GENERATE_RDF_CONFIG = "generate_rdf_config.yaml"

# Step 1: regenerate the author-cell-type config from the curated tables
# and persist it under config/.
cxg_author_cell_type_yaml = generate_author_cell_type_config()
output_file_path = os.path.join(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), CONFIG_DIRECTORY),
    CXG_AUTHOR_CELL_TYPE_CONFIG,
)
write_yaml_file(cxg_author_cell_type_yaml, output_file_path)

# Step 2: for each dataset referenced in the config, download its AnnData
# file, build the RDF graph under graph/, then delete the download to
# reclaim disk space.
datasets = get_dataset_dict(cxg_author_cell_type_yaml)
for dataset, author_cell_types in datasets.items():
    dataset_path = download_dataset_with_id(dataset)
    generate_rdf_graph(
        dataset_path,
        author_cell_types,
        os.path.join(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), GRAPH_DIRECTORY),
            dataset,
        ),
    )
    delete_file(dataset_path)
79 changes: 79 additions & 0 deletions src/pull_anndata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import logging
import os
from typing import Dict, List, Optional, Union
import yaml

import cellxgene_census


logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def download_dataset_with_id(dataset_id: str, file_path: Optional[str] = None) -> str:
    """
    Download an AnnData dataset with the specified ID.

    Args:
        dataset_id (str): The ID of the dataset to download.
        file_path (Optional[str], optional): File name to save the AnnData
            under (inside this script's ``dataset`` folder). If not provided,
            ``<dataset_id>.h5ad`` is used. Defaults to None.

    Returns:
        str: The path to the downloaded (or already present) file.
    """
    anndata_file_path = f"{dataset_id}.h5ad" if file_path is None else file_path
    anndata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        os.path.join("dataset", anndata_file_path),
    )
    if os.path.exists(anndata_file_path):
        logger.info(f"File '{anndata_file_path}' already exists. Skipping download.")
    else:
        # Ensure the dataset directory exists before downloading into it
        # (no-op when already present).
        os.makedirs(os.path.dirname(anndata_file_path), exist_ok=True)
        logger.info(f"Downloading dataset with ID '{dataset_id}' to '{anndata_file_path}'...")
        cellxgene_census.download_source_h5ad(dataset_id, to_path=anndata_file_path)
        logger.info(f"Download complete. File saved at '{anndata_file_path}'.")
    return anndata_file_path


def delete_file(file_name):
    """Remove ``file_name``, logging the outcome.

    A failed removal (missing file, permissions, ...) is logged rather
    than raised, so cleanup problems never abort the pipeline.
    """
    try:
        os.remove(file_name)
        logger.info(f"File '{file_name}' deleted successfully.")
    except OSError as e:
        # Deliberately best-effort, but report at WARNING (not INFO) so the
        # failure is visible under the default WARNING logging config.
        logger.warning(f"Error deleting file '{file_name}': {e}")


def get_dataset_dict(input_source: List[Dict]):
    """Map dataset IDs to their author cell type lists.

    Args:
        input_source: Config entries, each holding a ``CxG_link`` URL and
            an ``author_cell_type_list``.

    Returns:
        dict: ``{dataset_id: author_cell_type_list}``, where the ID is the
        link's last path segment truncated at its first ``.``.
    """
    datasets = {}
    for entry in input_source:
        dataset_id = get_dataset_id_from_link(entry["CxG_link"])
        datasets[dataset_id.split(".")[0]] = entry["author_cell_type_list"]
    return datasets


def get_dataset_id_from_link(cxg_link: str) -> str:
    """Return the last path segment of a CxG dataset URL.

    Handles links both with and without a single trailing slash.
    """
    segments = cxg_link.split("/")
    return segments[-2] if cxg_link.endswith("/") else segments[-1]


def read_yaml_config(config_file: str):
    """Parse ``config_file`` as YAML (using the safe loader) and return it."""
    with open(config_file, "r") as config_handle:
        return yaml.safe_load(config_handle)


if __name__ == "__main__":
    # Standalone entry point: read the generated author-cell-type config
    # and download every dataset it references (downloads already on disk
    # are skipped inside download_dataset_with_id).
    config_list = read_yaml_config(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.path.join("config", "cxg_author_cell_type.yaml"),
        )
    )
    datasets = get_dataset_dict(config_list)
    for dataset in datasets.keys():
        dataset_name = download_dataset_with_id(dataset)

0 comments on commit 8a1550f

Please sign in to comment.