Skip to content

Commit

Permalink
Merge pull request #12 from puja-trivedi/linkml_trimmer
Browse files Browse the repository at this point in the history
Linkml Trimmer
  • Loading branch information
djarecka authored Mar 11, 2024
2 parents 18f7d64 + fb015e0 commit 1b83495
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 18 deletions.
33 changes: 16 additions & 17 deletions bkbit/scripts/README.md
Original file line number Diff line number Diff line change
@@ -1,29 +1,28 @@
# gfftranslator.py

gfftranslator.py is a Python script that generates GeneAnnotation objects from data stored in gff files.
# linkml_trimmer.py

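linkml_trimmer.py produces a trimmed version of a LinkML model: it keeps the classes, slots, and enums you select, plus everything reachable from them, and removes the rest.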
## Usage

```python
from bkbit.scripts.gfftranslator import gff_to_gene_annotation
# Step 1: import YamlTrimmer
from bkbit.scripts.linkml_trimmer import YamlTrimmer

# input_fname is the name of the input csv file
# Note: example input data can be found on Allen Teams under Knowledge Graph files. "20230412_subset_genome_annotation.csv"
input_fname = 'XXX.csv'
# Step 2: initialize YamlTrimmer Object with a linkml model
trimmed_model = YamlTrimmer(path_to_linkml_model)

# data_dir is the directory path where the input csv file exists
data_dir = ' XXX/XXX/'
# Step 3: define the classes, slots, and enums that should be included in the trimmed model
classes = [...] # List of classes to keep
slots = [...] # List of slots to keep
enums = [...] # List of enums to keep

# output_dir is the directory path where all of the generated output files will be saved
# Note: if output_dir does not exist, gff_to_gene_annotation will create the directory
output_dir = 'XXX/XXX/'
# Step 4: call the trim_model function with the selected classes/slots/enums
# Note: only classes is a required parameter. slots and enums are optional
trimmed_model.trim_model(classes, slots, enums)

gff_to_gene_annotation(input_fname, data_dir, output_dir)
# Step 5: call the serialize function to print the trimmed linkml model as YAML
trimmed_model.serialize()
```

## Notes

1. Input csv file
a. Each row in the csv file contains a url to the .gff file as well as additional attributes to describe the dataset. The csv file must contain the following columns: authority, label, taxon_local_unique_identifier, version, gene_identifier_prefix, url.
2. Generated files
a. For each .gff file 3 files will be generated: (i) The raw data downloaded from the url provided will be saved as a csv file in the 'data_dir' directory provided. (ii) The parsed and cleaned data will be saved as a csv file in the 'output_dir' directory provided. (iii) The initialized GeneAnnotation objects will be saved as a list of json dictionaries in a json file in the 'output_dir' directory provided.
1. To produce bican_biolink.yaml, call trim_model with classes = ['gene', 'genome', 'organism taxon', 'thing with taxon', 'material sample', 'procedure', 'entity', 'activity', 'named thing'] and then call serialize.
1 change: 0 additions & 1 deletion bkbit/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
from .gfftranslator import *
117 changes: 117 additions & 0 deletions bkbit/scripts/linkml_trimmer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from dataclasses import dataclass
from typing import Union
from pathlib import Path
from linkml_runtime.linkml_model.meta import SchemaDefinition
from linkml_runtime.utils.schemaview import SchemaView

from linkml._version import __version__
from linkml.generators.yamlgen import YAMLGenerator


# NOTE(review): no dataclass fields are declared and __init__ is hand-written,
# so @dataclass only contributes its generated __repr__/__eq__. It is kept so
# existing behavior does not change.
@dataclass
class YamlTrimmer:
    """Trim a LinkML schema down to selected elements and what they reach.

    The schema is wrapped in a ``SchemaView``. ``trim_model`` deletes the
    classes, slots, and enums that are not reachable from the requested ones,
    modifying the view in place, and ``serialize`` prints the resulting schema
    as YAML.
    """

    def __init__(self, schema: Union[str, Path, SchemaDefinition]):
        """Create the ``SchemaView`` that the other methods operate on.

        Args:
            schema: Passed unchanged to ``SchemaView``.
        """
        self.schemaview = SchemaView(schema)

    def trim_model(
        self,
        keep_classes: list[str],
        keep_slots: Union[list[str], None] = None,
        keep_enums: Union[list[str], None] = None,
    ):
        """Delete every class, slot, and enum not reachable from the kept ones.

        Starting from the requested names, the schema is walked depth-first:
        a class leads to its parent classes and its direct slots, a slot leads
        to its range classes/enums and its parent slots, and an enum leads to
        its parent enums. Anything defined in the schema that was never
        visited is then deleted from ``self.schemaview`` in place.

        Args:
            keep_classes: Names of classes to keep.
            keep_slots: Names of slots to keep. ``None`` (the default) means
                no extra slots.
            keep_enums: Names of enums to keep. ``None`` (the default) means
                no extra enums.

        Raises:
            ValueError: If a name taken from the stack is not a class, slot,
                or enum of the schema. This is raised during the walk, before
                anything has been deleted.
        """
        sv = self.schemaview

        # Names of the classes, enums, and slots known to the schema view
        # (queried with imports=False). Only these are visited or deleted.
        all_classes = set(sv.all_classes(imports=False))
        all_enums = set(sv.all_enums(imports=False))
        all_slots = set(sv.all_slots(imports=False, attributes=False))

        # visited_classes, visited_slots, and visited_enums record everything
        # reachable from the requested classes, slots, and enums.
        visited_classes = set()
        visited_slots = set()
        visited_enums = set()

        # Work list for the traversal. None defaults are normalised here so no
        # mutable default object is shared between calls.
        stack = []
        stack.extend(keep_classes)
        stack.extend(keep_slots or [])
        stack.extend(keep_enums or [])

        while stack:
            curr_node = stack.pop()
            if (
                curr_node in visited_classes
                or curr_node in visited_enums
                or curr_node in visited_slots
            ):
                continue

            if curr_node in all_classes:
                visited_classes.add(curr_node)

                # Parent classes must survive for inheritance to resolve.
                for inherited_class in sv.class_parents(curr_node, imports=False):
                    if (
                        inherited_class not in visited_classes
                        and inherited_class in all_classes
                    ):
                        stack.append(inherited_class)

                # Queue the class's direct slots; their ranges are followed
                # when each slot is popped.
                # NOTE(review): all_slots is built with attributes=False, so
                # attribute names returned here presumably fail the
                # `in all_slots` test and their ranges are never followed.
                # Confirm this is intended.
                for slot in sv.class_slots(
                    curr_node, imports=False, direct=True, attributes=True
                ):
                    if slot not in visited_slots and slot in all_slots:
                        stack.append(slot)

            elif curr_node in all_slots:
                visited_slots.add(curr_node)

                # Follow every range of the slot that is a class or an enum.
                for slot_range in sv.slot_range_as_union(
                    sv.get_slot(curr_node, strict=True)
                ):
                    if (
                        slot_range in all_classes and slot_range not in visited_classes
                    ) or (slot_range in all_enums and slot_range not in visited_enums):
                        stack.append(slot_range)

                for parent_slot in sv.slot_parents(curr_node, imports=False):
                    if parent_slot not in visited_slots and parent_slot in all_slots:
                        stack.append(parent_slot)

            elif curr_node in all_enums:
                visited_enums.add(curr_node)

                # Parent enums are kept alongside the enum itself.
                for parent_enum in sv.enum_parents(curr_node, imports=False):
                    if parent_enum not in visited_enums and parent_enum in all_enums:
                        stack.append(parent_enum)

            else:
                raise ValueError(
                    f"ERROR: {curr_node} not found in schema classes, slots, or enums"
                )

        # Drop everything the traversal never reached. The loops run over the
        # name sets captured above, not over the schema being modified.
        for c in all_classes:
            if c not in visited_classes:
                sv.delete_class(c)
        for e in all_enums:
            if e not in visited_enums:
                sv.delete_enum(e)
        for s in all_slots:
            if s not in visited_slots:
                sv.delete_slot(s)

    def serialize(self):
        """Print the current schema, serialized to YAML by ``YAMLGenerator``.

        The output goes to standard output; nothing is returned.
        """
        print(YAMLGenerator(self.schemaview.schema).serialize())


# Script entry point: nothing runs when the module is executed directly;
# the guard body is a no-op placeholder.
if __name__ == "__main__":
    pass

0 comments on commit 1b83495

Please sign in to comment.