-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from puja-trivedi/linkml_trimmer
Linkml Trimmer
- Loading branch information
Showing
3 changed files
with
133 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,28 @@ | ||
# gfftranslator.py | ||
|
||
gfftranslator.py is a Python script that generates GeneAnnotation objects from data stored in gff files. | ||
# linkml_trimmer.py | ||
|
||
linkml_trimmer returns a trimmed version of a linkml model. | ||
## Usage | ||
|
||
```python | ||
from bkbit.scripts.gfftranslator import gff_to_gene_annotation | ||
# Step 1: import YamlTrimmer | ||
from bkbit.scripts.linkml_trimmer import YamlTrimmer | ||
|
||
# input_fname is the name of the input csv file | ||
# Note: example input data can be found on Allen Teams under Knowledge Graph files. "20230412_subset_genome_annotation.csv" | ||
input_fname = 'XXX.csv' | ||
# Step 2: initialize YamlTrimmer Object with a linkml model | ||
trimmed_model = YamlTrimmer(path_to_linkml_model) | ||
|
||
# data_dir is the directory path where the input csv file exists | ||
data_dir = ' XXX/XXX/' | ||
# Step 3: define the classes, slots, and enums that should be included in the trimmed model | ||
classes = [...] # List of classes to keep | ||
slots = [...] # List of slots to keep | ||
enums = [...] # List of enums to keep | ||
|
||
# output_dir is the directory path where all of the generated output files will be saved | ||
# Note: if output_dir does not exist, gff_to_gene_annotation will create the directory | ||
output_dir = 'XXX/XXX/' | ||
# Step 4: call the trim_model function with the selected classes/slots/enums | ||
# Note: only classes is required; slots and enums are optional and default to empty | ||
trimmed_model.trim_model(classes, slots, enums) | ||
|
||
gff_to_gene_annotation(input_fname, data_dir, output_dir) | ||
# Step 5: call the serialize function to print the trimmed linkml model as YAML | ||
trimmed_model.serialize() | ||
``` | ||
|
||
## Notes | ||
|
||
1. Input csv file | ||
a. Each row in the csv file contains a url to the .gff file as well as additional attributes to describe the dataset. The csv file must contain the following columns: authority, label, taxon_local_unique_identifier, version, gene_identifier_prefix, url. | ||
2. Generated files | ||
a. For each .gff file 3 files will be generated: (i) The raw data downloaded from the url provided will be saved as a csv file in the 'data_dir' directory provided. (ii) The parsed and cleaned data will be saved as a csv file in the 'output_dir' directory provided. (iii) The initialized GeneAnnotation objects will be saved as a list of json dictionaries in a json file in the 'output_dir' directory provided. | ||
1. To produce bican_biolink.yaml call trim_model with classes = ['gene', 'genome', 'organism taxon', 'thing with taxon', 'material sample', 'procedure', 'entity', 'activity', 'named thing'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +0,0 @@ | ||
from .gfftranslator import * | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from dataclasses import dataclass | ||
from typing import Union | ||
from pathlib import Path | ||
from linkml_runtime.linkml_model.meta import SchemaDefinition | ||
from linkml_runtime.utils.schemaview import SchemaView | ||
|
||
from linkml._version import __version__ | ||
from linkml.generators.yamlgen import YAMLGenerator | ||
|
||
|
||
# Plain class rather than a dataclass: no fields are declared, so the
# generated __eq__/__repr__ would ignore the wrapped schema entirely and the
# generated __hash__ would make instances unhashable.
class YamlTrimmer:
    """Trim a LinkML schema down to the elements reachable from a chosen set.

    The schema is wrapped in a ``SchemaView`` kept on ``self.schemaview``.
    ``trim_model`` deletes unreachable classes, slots, and enums from that
    view in place; ``serialize`` prints the resulting schema as YAML.
    """

    def __init__(self, schema: Union[str, Path, "SchemaDefinition"]):
        """Wrap ``schema`` in a ``SchemaView``.

        Args:
            schema: Value handed unchanged to ``SchemaView`` (annotated as a
                string, a ``Path``, or a ``SchemaDefinition``).
        """
        self.schemaview = SchemaView(schema)

    def trim_model(
        self,
        keep_classes: list[str],
        keep_slots: Union[list[str], None] = None,
        keep_enums: Union[list[str], None] = None,
    ):
        """Delete every class, slot, and enum not reachable from the seeds.

        Starting from the requested names, the traversal follows:

        * class -> its parent classes and its direct slots,
        * slot  -> its ranges (classes or enums) and its parent slots,
        * enum  -> its parent enums.

        Only elements defined in the schema itself are considered (all
        lookups use ``imports=False``). A name that is not among the schema's
        classes/slots/enums is never followed; in particular the slot set is
        built with ``attributes=False``, so class attributes returned by
        ``class_slots`` are skipped unless they also appear in that set.

        Nothing is deleted until the traversal has finished, so a
        ``ValueError`` leaves the schema view untouched.

        Args:
            keep_classes: Names of classes to keep.
            keep_slots: Names of slots to keep. ``None`` (the default) means
                no extra slots.
            keep_enums: Names of enums to keep. ``None`` (the default) means
                no extra enums.

        Raises:
            ValueError: If a requested name is not a class, slot, or enum of
                the schema.
        """
        sv = self.schemaview

        # Elements proven reachable from the requested seeds.
        visited_classes = set()
        visited_slots = set()
        visited_enums = set()

        # Work list of element names still to be expanded (depth-first).
        # ``None`` defaults are normalised here instead of using mutable
        # default arguments in the signature.
        stack = []
        stack.extend(keep_classes)
        stack.extend(keep_slots or [])
        stack.extend(keep_enums or [])

        # Every class, enum, and slot defined directly in the schema.
        all_classes = set(sv.all_classes(imports=False))
        all_enums = set(sv.all_enums(imports=False))
        all_slots = set(sv.all_slots(imports=False, attributes=False))

        while stack:
            curr_node = stack.pop()
            if (
                curr_node in visited_classes
                or curr_node in visited_enums
                or curr_node in visited_slots
            ):
                continue

            if curr_node in all_classes:
                visited_classes.add(curr_node)
                # Parent classes must survive for inheritance to resolve.
                for inherited_class in sv.class_parents(curr_node, imports=False):
                    if (
                        inherited_class not in visited_classes
                        and inherited_class in all_classes
                    ):
                        stack.append(inherited_class)

                # Direct slots of the class; their ranges are expanded when
                # the slot itself is popped.
                for slot in sv.class_slots(
                    curr_node, imports=False, direct=True, attributes=True
                ):
                    if slot not in visited_slots and slot in all_slots:
                        stack.append(slot)

            elif curr_node in all_slots:
                visited_slots.add(curr_node)
                # Ranges that are not classes or enums of this schema are
                # not followed.
                for slot_range in sv.slot_range_as_union(
                    sv.get_slot(curr_node, strict=True)
                ):
                    if (
                        slot_range in all_classes and slot_range not in visited_classes
                    ) or (slot_range in all_enums and slot_range not in visited_enums):
                        stack.append(slot_range)
                for parent_slot in sv.slot_parents(curr_node, imports=False):
                    if parent_slot not in visited_slots and parent_slot in all_slots:
                        stack.append(parent_slot)

            elif curr_node in all_enums:
                visited_enums.add(curr_node)
                for parent_enum in sv.enum_parents(curr_node, imports=False):
                    if parent_enum not in visited_enums and parent_enum in all_enums:
                        stack.append(parent_enum)

            else:
                raise ValueError(
                    f"ERROR: {curr_node} not found in schema classes, slots, or enums"
                )

        # Drop everything the traversal never reached.
        for unreachable_class in all_classes - visited_classes:
            sv.delete_class(unreachable_class)
        for unreachable_enum in all_enums - visited_enums:
            sv.delete_enum(unreachable_enum)
        for unreachable_slot in all_slots - visited_slots:
            sv.delete_slot(unreachable_slot)

    def serialize(self):
        """Print the current schema, serialized to YAML by ``YAMLGenerator``."""
        print(YAMLGenerator(self.schemaview.schema).serialize())
|
||
|
||
# Running this module directly currently does nothing; no command-line behaviour is defined here.
if __name__ == "__main__": | ||
pass |