Skip to content

Commit

Permalink
Merge pull request #27 from VirtualFlyBrain/FBco_switch
Browse files Browse the repository at this point in the history
refactor to use FBco and FBal
  • Loading branch information
Clare72 authored May 17, 2024
2 parents e67a7bc + f828cc5 commit 1c67620
Show file tree
Hide file tree
Showing 11 changed files with 38,193 additions and 380,082 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/auto_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
- name: Run ontology release
env:
DEFAULT_BRANCH: master
run: cd src/ontology && make ROBOT_ENV='ROBOT_JAVA_ARGS=-Xmx6G' update_ontology -B && make ROBOT_ENV='ROBOT_JAVA_ARGS=-Xmx6G' prepare_release -B
run: cd src/ontology && make ROBOT_ENV='ROBOT_JAVA_ARGS=-Xmx6G' prepare_release -B

- name: Fix repo ownership issue
run: git config --global --add safe.directory /__w/vfb-driver-ontology/vfb-driver-ontology
Expand Down
8 changes: 5 additions & 3 deletions src/ontology/VFB_drivers-annotations.ofn
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ Prefix(rdfs:=<http://www.w3.org/2000/01/rdf-schema#>)

Ontology(<http://virtualflybrain.org/data/VFB/OWL/VFB_drivers.owl>
Import(<http://purl.obolibrary.org/obo/VFB_drivers/imports/merged_import.owl>)
Annotation(<http://purl.org/dc/elements/1.1/description> "An ontology of Drosophila melanogaster drivers and expression patterns. Split expression pattern classes are taken from Virtual Fly Brain, where these were created and linked to FlyBase terms for hemidrivers. Hemidriver classes are created using data taken directly from FlyBase. Other driver classes are retrieved from Virtual Fly Brain, which creates these using data from FlyBase."^^xsd:string)
Annotation(<http://purl.org/dc/elements/1.1/title> "VFB Driver Ontology"^^xsd:string)
Annotation(<http://purl.org/dc/elements/1.1/description> "An ontology of Drosophila melanogaster drivers and expression patterns. All genetic objects are created using data taken directly from FlyBase.")
Annotation(<http://purl.org/dc/elements/1.1/title> "VFB Driver Ontology")
Annotation(<http://purl.org/dc/terms/contributor> "http://orcid.org/0000-0002-1373-1705")
Annotation(<http://purl.org/dc/terms/license> "https://creativecommons.org/licenses/by/4.0/"^^xsd:string)
Annotation(<http://purl.org/dc/terms/license> "https://creativecommons.org/licenses/by/4.0/")

Declaration(Class(<http://purl.obolibrary.org/obo/fbbt/vfb/VFBext_0000010>))
Declaration(ObjectProperty(<http://purl.obolibrary.org/obo/fbbt/vfb/VFBext_0000008>))
Declaration(AnnotationProperty(<http://purl.org/dc/elements/1.1/description>))
Declaration(AnnotationProperty(<http://purl.org/dc/elements/1.1/title>))
Declaration(AnnotationProperty(<http://purl.org/dc/terms/license>))
Expand Down
417,730 changes: 37,979 additions & 379,751 deletions src/ontology/VFB_drivers-edit.owl

Large diffs are not rendered by default.

34 changes: 14 additions & 20 deletions src/ontology/VFB_drivers.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,26 @@ prepare_release: all $(ONT)-cedar.owl $(REPORTDIR)/robot_diff.txt

CLEANFILES:=$(CLEANFILES) $(ONT)-cedar.owl $(IMPORTDIR)/*_terms_combined.txt

.PHONY: get_FB_hemidrivers
get_FB_hemidrivers: $(TMPDIR)
# get hemidriver data from public chado
$(TMPDIR)/FB_data.tsv: | $(TMPDIR)
apt-get update
apt-get -y install postgresql-client
psql -h chado.flybase.org -U flybase flybase -f ../sql/hemidrivers.sql > $(TMPDIR)/hemidrivers.tsv
psql -h chado.flybase.org -U flybase flybase -f ../sql/FB_query.sql > $(TMPDIR)/FB_data.tsv

.PHONY: update_ontology
update_ontology: get_FB_hemidrivers
python3 -m pip install -r $(SCRIPTSDIR)/requirements.txt && \
python3 $(SCRIPTSDIR)/update_ontology.py &&\
$(ROBOT) template \
--template properties_template.tsv \
--output $(TMPDIR)/VFB_drivers-properties-tmp.owl &&\
$(TMPDIR)/template.tsv: | $(TMPDIR)
python3 $(SCRIPTSDIR)/process_FB_data.py &&\
python3 $(SCRIPTSDIR)/make_template.py

$(SRC): $(TMPDIR)/FB_data.tsv $(TMPDIR)/template.tsv
$(ROBOT) template \
--input $(TMPDIR)/VFB_drivers-properties-tmp.owl \
--template template.tsv \
--output $(TMPDIR)/VFB_drivers-classes-tmp.owl &&\
$(ROBOT) merge \
--merge-before \
--input VFB_drivers-annotations.ofn \
--input $(TMPDIR)/VFB_drivers-properties-tmp.owl \
--input $(TMPDIR)/VFB_drivers-classes-tmp.owl \
--include-annotations true --collapse-import-closure false \
--output VFB_drivers-edit.owl &&\
--prefix "FlyBase: http://flybase.org/reports/" \
--prefix "VFBext: http://purl.obolibrary.org/obo/fbbt/vfb/VFBext_" \
--template $(TMPDIR)/template.tsv \
--include-annotations true \
--collapse-import-closure false \
--output $@ &&\
echo "\nOntology source file updated!\n"
rm template.tsv properties_template.tsv $(TMPDIR)/VFB_drivers-properties-tmp.owl $(TMPDIR)/VFB_drivers-classes-tmp.owl

$(ONT).owl: $(ONT)-full.owl
grep -v owl:versionIRI $< > $@.tmp.owl
Expand Down
3 changes: 0 additions & 3 deletions src/ontology/run_release.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
# update edit file
sh run.sh make update_ontology -B

# 2. run release
sh run.sh make prepare_release -B
28 changes: 28 additions & 0 deletions src/scripts/make_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd

# Processed FlyBase export produced by process_FB_data.py:
# one row per combination (FBco) / allele (FBal) pair.
FB_data = pd.read_csv('tmp/FB_data_processed.tsv', sep='\t', low_memory=False)

# Combination (FBco) rows: keep one template row per combination/hemidriver
# pair so ROBOT emits one "VFBext:0000008 some <allele>" axiom per hemidriver.
# .copy() avoids pandas SettingWithCopyWarning on the column assignments below.
FBco_data = FB_data[['combo_id', 'combo_name', 'combo_symbol', 'combo_synonyms', 'allele_id']].copy()
FBco_data = FBco_data.rename(columns={'combo_id': 'ID', 'combo_name': 'Name', 'combo_symbol': 'Symbol', 'combo_synonyms': 'Synonyms', 'allele_id': 'Hemidriver'})
FBco_data['Parent'] = 'VFBext:0000010'
FBco_data['TYPE'] = 'owl:Class'
FBco_data['Xref'] = 'FlyBase'

# Allele (FBal) rows: an allele used in several combinations appears on
# several FB_data rows, which would yield identical template rows — drop
# the exact duplicates (ROBOT only needs each class defined once).
FBal_data = FB_data[['allele_id', 'allele_name', 'allele_synonyms', 'tool_fbcv']].copy()
FBal_data = FBal_data.rename(columns={'allele_id': 'ID', 'allele_name': 'Name', 'allele_synonyms': 'Synonyms', 'tool_fbcv': 'Parent'})
FBal_data['TYPE'] = 'owl:Class'
FBal_data['Xref'] = 'FlyBase'
FBal_data = FBal_data.drop_duplicates()

# ROBOT template header row: maps each column to an OWL axiom pattern.
# 'Definition' has no matching data column yet — kept as a placeholder so the
# template accepts definitions if they are added later.
template_header = pd.DataFrame({'ID': ['ID'], 'TYPE': ['TYPE'],
                                "Name": ["A rdfs:label SPLIT=|"],
                                "Definition": ["A IAO:0000115"],
                                "Synonyms": ["A oboInOwl:hasExactSynonym SPLIT=|"],
                                "Symbol": ["A IAO:0000028"], "Parent": ["C % SPLIT=|"],
                                "Hemidriver": ["C VFBext:0000008 some %"],
                                "Xref": ["A oboInOwl:hasDbXref"]})

template = pd.concat([template_header, FBco_data, FBal_data], ignore_index=True)

template.to_csv('tmp/template.tsv', sep='\t', index=False)
89 changes: 89 additions & 0 deletions src/scripts/process_FB_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
from oaklib import get_adapter
import re

# Raw FlyBase query results (one row per combination/allele pair).
FB_data = pd.read_csv('tmp/FB_data.tsv', sep='\t', low_memory=False)

# check that each FBco has at least two alleles (rows)
# NB not restricting to exactly two
singletons = FB_data[~FB_data['combo_id'].duplicated(keep=False)]
if len(singletons) > 0:
    raise ValueError('Single allele combinations present:', singletons['combo_id'].to_list())

# Check that only one label per FBco and FBal ('|' separates multiple values).
# Raw strings: '\|' in a plain string is an invalid escape sequence
# (SyntaxWarning on modern Python).
multi_label_combos = FB_data[FB_data['combo_name'].str.contains(r'\|')]
if len(multi_label_combos) > 0:
    raise ValueError('Multiple labels for combination:', multi_label_combos[['combo_id', 'combo_name']])

multi_label_alleles = FB_data[FB_data['allele_name'].str.contains(r'\|')]
if len(multi_label_alleles) > 0:
    # was "combination" — this check is for alleles
    raise ValueError('Multiple labels for allele:', multi_label_alleles[['allele_id', 'allele_name']])

# remove any synonyms that are duplicates of the label
def process_synonyms(object_name: str, synonyms: str):
    """Remove object_name from a pipe-separated synonyms string.

    Returns the remaining synonyms joined with '|' ("" if none remain),
    the synonyms unchanged when object_name is not among them, or None
    when synonyms is missing (NaN float from pandas).

    The original bare except also swallowed the ValueError from
    list.remove() when the label was absent, discarding all synonyms;
    now only the missing-value (AttributeError) case is handled.
    """
    try:
        synonym_list = synonyms.split('|')
    except AttributeError:
        # synonyms is NaN (float) when the object has no synonyms
        return None
    if object_name in synonym_list:
        synonym_list.remove(object_name)
    return '|'.join(synonym_list)

# Strip each object's own label out of its synonym list.
FB_data['combo_synonyms'] = FB_data.apply(lambda row: process_synonyms(row.combo_name, row.combo_synonyms), axis=1)
FB_data['allele_synonyms'] = FB_data.apply(lambda row: process_synonyms(row.allele_name, row.allele_synonyms), axis=1)

# Replace the literal word INTERSECTION with the set-intersection symbol in split combination names.
FB_data['combo_name'] = FB_data['combo_name'].map(lambda name: name.replace('INTERSECTION', '∩'))

# merge direct and indirect tools and FBcv terms
def join_direct_and_indirect(allele: str, direct: str, indirect: str):
    """Join direct and indirect values with '|', tolerating a missing side.

    Either value may be missing (NaN/None), in which case the other is
    returned alone.

    Raises:
        ValueError: if neither direct nor indirect is a string — an allele
            with no tool data at all indicates bad input.
    """
    try:
        return '|'.join([direct, indirect])
    except TypeError:
        # one (or both) side is missing — keep whichever is a real string
        if isinstance(direct, str):
            return direct
        if isinstance(indirect, str):
            return indirect
        raise ValueError(f'No direct or indirect tools for {allele}')

# Collapse the direct/indirect tool columns into single merged columns.
FB_data['tool_id'] = FB_data.apply(lambda row: join_direct_and_indirect(row.allele_id, row.direct_tool_id, row.indirect_tool_id), axis=1)
FB_data['tool_fbcv'] = FB_data.apply(lambda row: join_direct_and_indirect(row.allele_id, row.direct_tool_fbcv, row.indirect_tool_fbcv), axis=1)
FB_data = FB_data.drop(columns=['direct_tool_id', 'indirect_tool_id', 'direct_tool_fbcv', 'indirect_tool_fbcv'])

# Collect every FBcv term under SC FBcv_0009027 'split driver fragment';
# anything outside this set will be filtered out below.
FBcv_adapter = get_adapter("sqlite:obo:fbcv")
split_IDs = list(FBcv_adapter.descendants('FBcv:0009027'))

def process_split_cv_terms(cv_terms: str, allele_ID: str, ID_list=split_IDs):
    """Keep only FBcv terms that define split components, error if there are none."""
    kept = [term for term in cv_terms.split('|') if term in ID_list]
    if not kept:
        raise ValueError(f"No valid FBcv terms for {allele_ID}")
    return '|'.join(kept)

FB_data['tool_fbcv'] = FB_data.apply(lambda row: process_split_cv_terms(row.tool_fbcv, row.allele_id), axis=1)

# try to choose a symbol for each FBco

# Candidate symbol shape: two uppercase letters + digits, optionally one
# trailing uppercase letter (e.g. "MB001B"); group 1 is the stem without
# the trailing letter.
symbol_pattern = re.compile('^([A-Z]{2}[0-9]+)[A-Z]?$')
def choose_symbol(synonyms: str):
    """Pick a symbol from among the synonyms if a single suitable synonym exists.

    Returns the unique matching synonym; or, when several synonyms match but
    share one stem (differ only in the trailing letter), that common stem;
    otherwise "" (including when synonyms is missing/NaN).
    """
    try:
        synonym_list = synonyms.split('|')
    except AttributeError:
        # synonyms is NaN (float) when the combination has none
        return ""
    # match each synonym exactly once (the original ran the regex twice per synonym)
    matches = [m for m in map(symbol_pattern.match, synonym_list) if m]
    if len(matches) == 1:
        return matches[0].group(0)
    elif len(matches) > 1:
        stems = list(set(m.group(1) for m in matches))
        if len(stems) == 1:
            return stems[0]
        else:
            return ""
    else:
        return ""

# Record the chosen symbol for each combination and write the processed table.
FB_data['combo_symbol'] = FB_data['combo_synonyms'].apply(choose_symbol)
FB_data.to_csv('tmp/FB_data_processed.tsv', sep='\t', index=None)
3 changes: 0 additions & 3 deletions src/scripts/requirements.txt

This file was deleted.

Loading

0 comments on commit 1c67620

Please sign in to comment.