Skip to content

Commit

Permalink
Merge pull request #209 from Knowledge-Graph-Hub/refactor_transform_names
Browse files Browse the repository at this point in the history

rename transform input and output folders
  • Loading branch information
hrshdhgd authored Aug 26, 2024
2 parents d74c31a + 79ef239 commit a78e1c0
Show file tree
Hide file tree
Showing 38 changed files with 181 additions and 908,162 deletions.
10 changes: 5 additions & 5 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@
url: https://mediadive.dsmz.de/rest/media
local_name: mediadive.json

# Condensed-traits
# Madin et al Condensed-traits
-
url: https://github.com/bacteria-archaea-traits/bacteria-archaea-traits/blob/master/output/condensed_traits_NCBI.csv?raw=true
local_name: traits.csv
local_name: madin_etal.csv

# # # ****Conversion Tables****
#
Expand All @@ -112,7 +112,7 @@
local_name: epm.json

#
# Uniprot
# Uniprot Functional Microbes
#
-
url: https://kghub.io/frozen_incoming_data/uniprot/uniprot_proteomes.tar.gz
Expand Down Expand Up @@ -185,10 +185,10 @@
local_name: disbiome.json

#
# PdMetagenomics
# Wallen et al
#
-
url: https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-022-34667-x/MediaObjects/41467_2022_34667_MOESM4_ESM.xlsx
local_name: PdMetagenomics.xlsx
local_name: wallen_etal.xlsx


2 changes: 1 addition & 1 deletion hpc/run_kg_transform_uniprot.sl
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ module load python/3.10
cd /global/cfs/cdirs/m4689/master/kg-microbe
source venv/bin/activate
git checkout master
poetry run kg transform -s UniprotTransform
poetry run kg transform -s UniprotFunctionalMicrobesTransform
24 changes: 12 additions & 12 deletions hpc/run_parallel_transform.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#SBATCH --array=0-10
#SBATCH -N 1

export ROBOT_JAVA_ARGS="-Xmx64h -XX:+UseG1GC"
export ROBOT_JAVA_ARGS="-Xmx64g -XX:+UseG1GC"
export JAVA_OPT="-Xmx64g -XX:+UseG1GC"

module use /global/common/software/m4689/public/modulefiles
Expand All @@ -22,17 +22,17 @@ cd kg-microbe

# Array of transform names
transforms=(
OntologyTransform
BacDiveTransform
MediaDiveTransform
TraitsTransform
RheaMappingsTransform
BactoTraitsTransform
UniprotHumanTransform
CtdTransform
DisbiomeTransform
PdMetagenomicsTransform
UniprotTransform
ontologies
bacdive
mediadive
madin_etal
rheamappings
bactotraits
uniprot_human
ctd
disbiome
wallen_etal
uniprot_functional_microbes
)

# Get the transform for this job array task
Expand Down
56 changes: 37 additions & 19 deletions kg_microbe/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,33 @@

from kg_microbe.transform_utils.bacdive.bacdive import BacDiveTransform
from kg_microbe.transform_utils.bactotraits.bactotraits import BactoTraitsTransform
from kg_microbe.transform_utils.ctd.ctd import CtdTransform
from kg_microbe.transform_utils.constants import (
BACDIVE,
BACTOTRAITS,
CTD,
DISBIOME,
MADIN_ETAL,
MEDIADIVE,
ONTOLOGIES,
RHEAMAPPINGS,
UNIPROT_FUNCTIONAL_MICROBES,
UNIPROT_HUMAN,
WALLEN_ETAL,
)
from kg_microbe.transform_utils.ctd.ctd import CTDTransform
from kg_microbe.transform_utils.disbiome.disbiome import DisbiomeTransform
from kg_microbe.transform_utils.madin_etal.madin_etal import MadinEtAlTransform
from kg_microbe.transform_utils.mediadive.mediadive import MediaDiveTransform
from kg_microbe.transform_utils.ontology.ontology_transform import ONTOLOGIES, OntologyTransform
from kg_microbe.transform_utils.pdmetagenomics.pdmetagenomics import PdMetagenomicsTransform
from kg_microbe.transform_utils.rhea.rhea import RheaMappingsTransform
from kg_microbe.transform_utils.traits.traits import TraitsTransform
from kg_microbe.transform_utils.uniprot.uniprot import UniprotTransform
from kg_microbe.transform_utils.ontologies.ontologies_transform import (
ONTOLOGIES_MAP,
OntologiesTransform,
)
from kg_microbe.transform_utils.rheamappings.rheamappings import RheaMappingsTransform
from kg_microbe.transform_utils.uniprot_functional_microbes.uniprot_functional_microbes import (
UniprotFunctionalMicrobesTransform,
)
from kg_microbe.transform_utils.uniprot_human.uniprot_human import UniprotHumanTransform
from kg_microbe.transform_utils.wallen_etal.wallen_etal import WallenEtAlTransform

DATA_SOURCES = {
# "DrugCentralTransform": DrugCentralTransform,
Expand All @@ -25,17 +43,17 @@
# "TCRDTransform": TCRDTransform,
# "ProteinAtlasTransform": ProteinAtlasTransform,
# "STRINGTransform": STRINGTransform,
"OntologyTransform": OntologyTransform,
"BacDiveTransform": BacDiveTransform,
"MediaDiveTransform": MediaDiveTransform,
"TraitsTransform": TraitsTransform,
"RheaMappingsTransform": RheaMappingsTransform,
"BactoTraitsTransform": BactoTraitsTransform,
"UniprotHumanTransform": UniprotHumanTransform,
"CtdTransform": CtdTransform,
"DisbiomeTransform": DisbiomeTransform,
"PdMetagenomicsTransform": PdMetagenomicsTransform,
"UniprotTransform": UniprotTransform,
ONTOLOGIES: OntologiesTransform,
BACDIVE: BacDiveTransform,
MEDIADIVE: MediaDiveTransform,
MADIN_ETAL: MadinEtAlTransform,
RHEAMAPPINGS: RheaMappingsTransform,
BACTOTRAITS: BactoTraitsTransform,
UNIPROT_HUMAN: UniprotHumanTransform,
CTD: CTDTransform,
DISBIOME: DisbiomeTransform,
WALLEN_ETAL: WallenEtAlTransform,
UNIPROT_FUNCTIONAL_MICROBES: UniprotFunctionalMicrobesTransform,
}


Expand Down Expand Up @@ -65,7 +83,7 @@ def transform(
if source in DATA_SOURCES:
logging.info(f"Parsing {source}")
t = DATA_SOURCES[source](input_dir, output_dir)
if source in ONTOLOGIES.keys():
t.run(ONTOLOGIES[source])
if source in ONTOLOGIES_MAP.keys():
t.run(ONTOLOGIES_MAP[source])
else:
t.run(show_status=show_status)
3 changes: 2 additions & 1 deletion kg_microbe/transform_utils/bacdive/bacdive.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
ASSAY_TO_NCBI_EDGE,
ASSESSED_ACTIVITY_RELATIONSHIP,
ATTRIBUTE_CATEGORY,
BACDIVE,
BACDIVE_API_BASE_URL,
BACDIVE_ENVIRONMENT_CATEGORY,
BACDIVE_ID_COLUMN,
Expand Down Expand Up @@ -177,7 +178,7 @@ def __init__(
output_dir: Optional[Path] = None,
):
"""Instantiate part."""
source_name = "BacDive"
source_name = BACDIVE
super().__init__(source_name, input_dir, output_dir)
self.ncbi_impl = get_adapter(f"sqlite:{NCBITAXON_SOURCE}")

Expand Down
57 changes: 36 additions & 21 deletions kg_microbe/transform_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,41 @@
import re
from pathlib import Path

# Source Names
MADIN_ETAL = "madin_etal"
BACDIVE = "bacdive"
MEDIADIVE = "mediadive"
BACTOTRAITS = "bactotraits"
RHEAMAPPINGS = "rheamappings"
ONTOLOGIES = "ontologies"
WALLEN_ETAL = "wallen_etal"
CTD = "ctd"
DISBIOME = "disbiome"
UNIPROT_FUNCTIONAL_MICROBES = "uniprot_functional_microbes"
UNIPROT_HUMAN = "uniprot_human"

TRANSFORM_UTILS_DIR = Path(__file__).parent
BACDIVE_DIR = TRANSFORM_UTILS_DIR / "bacdive"
BACDIVE_DIR = TRANSFORM_UTILS_DIR / BACDIVE
BACDIVE_TMP_DIR = BACDIVE_DIR / "tmp"
BACDIVE_YAML_DIR = BACDIVE_TMP_DIR / "yaml"
MEDIADIVE_DIR = TRANSFORM_UTILS_DIR / "mediadive"
MEDIADIVE_DIR = TRANSFORM_UTILS_DIR / MEDIADIVE
MEDIADIVE_TMP_DIR = MEDIADIVE_DIR / "tmp"
MEDIADIVE_MEDIUM_YAML_DIR = MEDIADIVE_TMP_DIR / "medium_yaml"
MEDIADIVE_MEDIUM_STRAIN_YAML_DIR = MEDIADIVE_TMP_DIR / "medium_strain_yaml"
TRAITS_DIR = TRANSFORM_UTILS_DIR / "traits"
MADIN_ETAL_DIR = TRANSFORM_UTILS_DIR / MADIN_ETAL
RAW_DATA_DIR = Path(__file__).parents[2] / "data" / "raw"
RHEA_DIR: Path = TRANSFORM_UTILS_DIR / "rhea"
RHEA_TMP_DIR = RHEA_DIR / "tmp"
RHEAMAPPINGS_DIR: Path = TRANSFORM_UTILS_DIR / RHEAMAPPINGS
RHEAMAPPINGS_TMP_DIR = RHEAMAPPINGS_DIR / "tmp"
BACTOTRAITS_DIR = TRANSFORM_UTILS_DIR / "bactotraits"
BACTOTRAITS_TMP_DIR = BACTOTRAITS_DIR / "tmp"
UNIPROT_TREMBL_DIR = TRANSFORM_UTILS_DIR / "uniprot_trembl"
UNIPROT_TREMBL_TMP_DIR = UNIPROT_TREMBL_DIR / "tmp"
ONTOLOGY_DIR = TRANSFORM_UTILS_DIR / "ontology"
ONTOLOGY_XREFS_DIR = ONTOLOGY_DIR / "xrefs"
ONTOLOGY_TREES_DIR = ONTOLOGY_DIR / "trees"
CHEBI_XREFS_FILEPATH = ONTOLOGY_XREFS_DIR / "chebi_xrefs.tsv"
MONDO_XREFS_FILEPATH = ONTOLOGY_XREFS_DIR / "mondo_xrefs.tsv"
MONDO_GENE_IDS_FILEPATH = ONTOLOGY_XREFS_DIR / "mondo_gene_ids.tsv"
ONTOLOGIES_DIR = TRANSFORM_UTILS_DIR / ONTOLOGIES
ONTOLOGIES_XREFS_DIR = ONTOLOGIES_DIR / "xrefs"
ONTOLOGIES_TREES_DIR = ONTOLOGIES_DIR / "trees"
CHEBI_XREFS_FILEPATH = ONTOLOGIES_XREFS_DIR / "chebi_xrefs.tsv"
MONDO_XREFS_FILEPATH = ONTOLOGIES_XREFS_DIR / "mondo_xrefs.tsv"
MONDO_GENE_IDS_FILEPATH = ONTOLOGIES_XREFS_DIR / "mondo_gene_ids.tsv"
CUSTOM_CURIES_YAML_FILE = TRANSFORM_UTILS_DIR / "custom_curies.yaml"
NCBITAXON_SOURCE = RAW_DATA_DIR / "ncbitaxon.owl"
CHEBI_SOURCE = RAW_DATA_DIR / "chebi.owl"
Expand Down Expand Up @@ -364,18 +377,20 @@
REPLACEMENT = "REPLACE"
SUPPLEMENT = "SUPPLEMENT"

CHEBI_MANUAL_ANNOTATION_PATH = TRAITS_DIR / "chebi_manual_annotation.tsv"
CHEBI_MANUAL_ANNOTATION_PATH = MADIN_ETAL_DIR / "chebi_manual_annotation.tsv"

# ROBOT
ROBOT_REMOVED_SUFFIX = "_removed_subset"
ROBOT_EXTRACT_SUFFIX = "_extract_subset"
EXCLUSION_TERMS_FILE = "exclusion_branches.tsv"

# Uniprot
UNIPROT_DIR = TRANSFORM_UTILS_DIR / "uniprot"
UNIPROT_TMP_DIR = UNIPROT_DIR / "tmp"
UNIPROT_RELEVANT_FILE_LIST = UNIPROT_TMP_DIR / "relevant_files.tsv"
UNIPROT_TMP_NE_DIR = UNIPROT_TMP_DIR / "nodes_and_edges"
UNIPROT_FUNCTIONAL_MICROBES_DIR = TRANSFORM_UTILS_DIR / UNIPROT_FUNCTIONAL_MICROBES
UNIPROT_FUNCTIONAL_MICROBES_TMP_DIR = UNIPROT_FUNCTIONAL_MICROBES_DIR / "tmp"
UNIPROT_FUNCTIONAL_MICROBES_RELEVANT_FILE_LIST = (
UNIPROT_FUNCTIONAL_MICROBES_TMP_DIR / "relevant_files.tsv"
)
UNIPROT_FUNCTIONAL_MICROBES_TMP_NE_DIR = UNIPROT_FUNCTIONAL_MICROBES_TMP_DIR / "nodes_and_edges"

# UniprotHuman
UNIPROT_HUMAN_DIR = TRANSFORM_UTILS_DIR / "uniprot_human"
Expand All @@ -387,7 +402,7 @@
UNIPROT_PROTEOMES_FILE = "uniprot_proteomes.tar.gz"
UNIPROT_HUMAN_FILE = "uniprot_human.tar.gz"
UNIPROT_S3_DIRECTORY = "s3"
GO_CATEGORY_TREES_FILE = ONTOLOGY_TREES_DIR / "go_category_trees.tsv"
GO_CATEGORY_TREES_FILE = ONTOLOGIES_TREES_DIR / "go_category_trees.tsv"

PROTEIN_CATEGORY = "biolink:Enzyme"
UNIPROT_FUNCTIONAL_MICROBES = "uniprot_functional_microbes"
Expand Down Expand Up @@ -524,7 +539,7 @@
COMBO_KEY = "combo"

# Unipathways
UNIPATHWAYS_XREFS_FILEPATH = ONTOLOGY_XREFS_DIR / "unipathways_xrefs.tsv"
UNIPATHWAYS_XREFS_FILEPATH = ONTOLOGIES_XREFS_DIR / "unipathways_xrefs.tsv"
UNIPATHWAYS_SHORT_PREFIX = "UPa"
UNIPATHWAYS_COMPOUND_PREFIX = "OBO:UPa_UPC"
UNIPATHWAYS_ENZYMATIC_REACTION_PREFIX = "OBO:UPa_UER"
Expand Down Expand Up @@ -614,6 +629,6 @@
ASSOCIATED_WITH_INCREASED_LIKELIHOOD_OF = ASSOCIATED_WITH
ASSOCIATED_WITH_DECREASED_LIKELIHOOD_OF = ASSOCIATED_WITH

# PD Metagenomics
PDMETAGENOMICS_DIR: Path = TRANSFORM_UTILS_DIR / "pdmetagenomics"
PDMETAGENOMICS_TMP_DIR = PDMETAGENOMICS_DIR / "tmp"
# Wallen et al
WALLEN_ETAL_DIR: Path = TRANSFORM_UTILS_DIR / WALLEN_ETAL
WALLEN_ETAL_TMP_DIR = WALLEN_ETAL_DIR / "tmp"
6 changes: 3 additions & 3 deletions kg_microbe/transform_utils/ctd/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Ctd transform."""
"""CTD transform."""

from .ctd import CtdTransform
from .ctd import CTDTransform

__all__ = ["CtdTransform"]
__all__ = ["CTDTransform"]
4 changes: 2 additions & 2 deletions kg_microbe/transform_utils/ctd/ctd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Uniprot Transform class."""
"""CTD Transform class."""

import csv
import gzip
Expand Down Expand Up @@ -34,7 +34,7 @@
}


class CtdTransform(Transform):
class CTDTransform(Transform):

"""A class used to represent a transformation process for CTD data."""

Expand Down
7 changes: 4 additions & 3 deletions kg_microbe/transform_utils/disbiome/disbiome.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ASSOCIATED_WITH_DECREASED_LIKELIHOOD_OF_PREDICATE,
ASSOCIATED_WITH_INCREASED_LIKELIHOOD_OF,
ASSOCIATED_WITH_INCREASED_LIKELIHOOD_OF_PREDICATE,
DISBIOME,
DISBIOME_DISEASE_NAME,
DISBIOME_ELEVATED,
DISBIOME_ORGANISM_ID,
Expand All @@ -25,8 +26,8 @@
NCBI_CATEGORY,
NCBITAXON_PREFIX,
)
from kg_microbe.transform_utils.pdmetagenomics.pdmetagenomics import MICROBE_NOT_FOUND_STR
from kg_microbe.transform_utils.transform import Transform
from kg_microbe.transform_utils.wallen_etal.wallen_etal import MICROBE_NOT_FOUND_STR
from kg_microbe.utils.pandas_utils import drop_duplicates


Expand All @@ -49,13 +50,13 @@ def __init__(self, input_dir: Optional[Path] = None, output_dir: Optional[Path]
If None, a default directory may be used.
:type output_dir: Optional[Path]
"""
source_name = "disbiome"
source_name = DISBIOME
super().__init__(source_name, input_dir, output_dir)

def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_status: bool = True):
"""Run DisbiomeTransform."""
if data_file is None:
data_file = "disbiome.json"
data_file = self.source_name + ".json"
input_file = self.input_base_dir / data_file

# Convert Disbiome taxa names to NCBITaxon IDs
Expand Down
5 changes: 5 additions & 0 deletions kg_microbe/transform_utils/madin_etal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""MadinEtAl transform."""

from .madin_etal import MadinEtAlTransform

__all__ = ["MadinEtAlTransform"]
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Transform the traits data from NCBI and GTDB."""
"""Transform the Madin et al data from NCBI and GTDB."""

import csv
from pathlib import Path
Expand Down Expand Up @@ -32,6 +32,7 @@
ISOLATION_SOURCE_COLUMN,
ISOLATION_SOURCE_PREFIX,
LOCATION_OF,
MADIN_ETAL,
METABOLISM_CATEGORY,
METABOLISM_COLUMN,
NAME_COLUMN,
Expand Down Expand Up @@ -68,10 +69,10 @@
PARENT_DIR = Path(__file__).resolve().parent


class TraitsTransform(Transform):
class MadinEtAlTransform(Transform):

"""
Ingest traits dataset (NCBI/GTDB).
Ingest Madin et al dataset (NCBI/GTDB).
Essentially just ingests and transforms this file:
https://github.com/bacteria-archaea-traits/bacteria-archaea-traits/blob/master/output/condensed_traits_NCBI.csv
Expand All @@ -91,12 +92,12 @@ class TraitsTransform(Transform):

def __init__(self, input_dir: str, output_dir: str, nlp=True) -> None:
"""
Initialize TraitsTransform Class.
Initialize MadinEtAlTransform Class.
:param input_dir: Input file path (str)
:param output_dir: Output file path (str)
"""
source_name = "traits"
source_name = MADIN_ETAL
super().__init__(source_name, input_dir, output_dir, nlp) # set some variables
self.nlp = nlp
self.metabolism_map_yaml = PARENT_DIR / "metabolism_map.yaml"
Expand Down
Loading

0 comments on commit a78e1c0

Please sign in to comment.