From fb015e0eb441f3613f488b43eace65c1a862cf3a Mon Sep 17 00:00:00 2001 From: Puja Trivedi Date: Mon, 11 Mar 2024 11:18:14 -0700 Subject: [PATCH] updated scripts/README.md with linkml_trimmer details --- bkbit/scripts/README.md | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/bkbit/scripts/README.md b/bkbit/scripts/README.md index a538162..f224d2a 100644 --- a/bkbit/scripts/README.md +++ b/bkbit/scripts/README.md @@ -1,29 +1,28 @@ -# gfftranslator.py - -gfftranslator.py is a Python script that generates GeneAnnotation objects from data stored in gff files. +# linkml_trimmer.py +linkml_trimmer returns a trimmed version of a linkml model. ## Usage ```python -from bkbit.scripts.gfftranslator import gff_to_gene_annotation +# Step 1: import YamlTrimmer +from bkbit.scripts.linkml_trimmer import YamlTrimmer -# input_fname is the name of the input csv file -# Note: example input data can be found on Allen Teams under Knowledge Graph files. "20230412_subset_genome_annotation.csv" -input_fname = 'XXX.csv' +# Step 2: initialize YamlTrimmer Object with a linkml model +trimmed_model = YamlTrimmer(path_to_linkml_model) -# data_dir is the directory path where the input csv file exists -data_dir = ' XXX/XXX/' +# Step 3: define the classes, slots, and enums that should be included in the trimmed model +classes = [...] # List of classes to keep +slots = [...] # List of slots to keep +enums = [...] # List of enums to keep -# output_dir is the directory path where all of the generated output files will be saved -# Note: if output_dir does not exist, gff_to_gene_annotation will create the directory -output_dir = 'XXX/XXX/' +# Step 4: call the trim_model function with the selected classes/slots/enums +# Note: only classes is a required parameter. 
slots and enums are optional. +trimmed_model.trim_model(classes, slots, enums) -gff_to_gene_annotation(input_fname, data_dir, output_dir) +# Step 5: call the serialize function to produce the trimmed linkml model +trimmed_model.serialize() ``` ## Notes -1. Input csv file -a. Each row in the csv file contains a url to the .gff file as well as additional attributes to describe the dataset. The csv file must contain the following columns: authority, label, taxon_local_unique_identifier, version, gene_identifier_prefix, url. -2. Generated files -a. For each .gff file 3 files will be generated: (i) The raw data downloaded from the url provided will be saved as a csv file in the 'data_dir' directory provided. (ii) The parsed and cleaned data will be saved as a csv file in the 'output_dir' directory provided. (iii) The initialized GeneAnnotation objects will be saved as a list of json dictionaries in a json file in the 'output_dir' directory provided. +1. To produce bican_biolink.yaml, call trim_model with classes = ['gene', 'genome', 'organism taxon', 'thing with taxon', 'material sample', 'procedure', 'entity', 'activity', 'named thing']