-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #27 from VirtualFlyBrain/FBco_switch
refactor to use FBco and FBal
- Loading branch information
Showing
11 changed files
with
38,193 additions
and
380,082 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,2 @@ | ||
# update edit file | ||
sh run.sh make update_ontology -B | ||
|
||
# 2. run release | ||
sh run.sh make prepare_release -B |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import pandas as pd | ||
from collections import OrderedDict | ||
|
||
|
||
# Read the processed table that this script turns into a template file.
FB_data = pd.read_csv('tmp/FB_data_processed.tsv', sep='\t', low_memory=False)

# Combination rows: select the combo columns plus the allele ID, rename them
# to the template column names, and attach constant Parent/TYPE/Xref values.
combo_columns = {
    'combo_id': 'ID',
    'combo_name': 'Name',
    'combo_symbol': 'Symbol',
    'combo_synonyms': 'Synonyms',
    'allele_id': 'Hemidriver',
}
FBco_data = (
    FB_data[list(combo_columns)]
    .rename(columns=combo_columns)
    .assign(Parent='VFBext:0000010', TYPE='owl:Class', Xref='FlyBase')
)

# Allele rows: same idea, but Parent comes from the per-row tool_fbcv column
# instead of a constant.
allele_columns = {
    'allele_id': 'ID',
    'allele_name': 'Name',
    'allele_synonyms': 'Synonyms',
    'tool_fbcv': 'Parent',
}
FBal_data = (
    FB_data[list(allele_columns)]
    .rename(columns=allele_columns)
    .assign(TYPE='owl:Class', Xref='FlyBase')
)

# Single row of per-column directive strings placed directly under the column
# names (presumably ROBOT template syntax - confirm against the build that
# consumes tmp/template.tsv). Key order here fixes the output column order.
header_row = {
    'ID': 'ID',
    'TYPE': 'TYPE',
    "Name": "A rdfs:label SPLIT=|",
    "Definition": "A IAO:0000115",
    "Synonyms": "A oboInOwl:hasExactSynonym SPLIT=|",
    "Symbol": "A IAO:0000028",
    "Parent": "C % SPLIT=|",
    "Hemidriver": "C VFBext:0000008 some %",
    "Xref": "A oboInOwl:hasDbXref",
}
template_header = pd.DataFrame([header_row])

# Stack the directive row, then combinations, then alleles, and write the result.
template = pd.concat([template_header, FBco_data, FBal_data], ignore_index=True)

template.to_csv('tmp/template.tsv', sep='\t', index=None)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import pandas as pd | ||
from oaklib import get_adapter | ||
import re | ||
|
||
# Load the raw table; the checks below read its combo_id, combo_name,
# allele_id and allele_name columns.
FB_data = pd.read_csv('tmp/FB_data.tsv', sep='\t', low_memory=False)

# check that each FBco has at least two alleles (rows)
# NB not restricting to exactly two
# duplicated(keep=False) flags every row whose combo_id occurs more than once,
# so the negation selects combinations that appear on a single row only.
single_allele_rows = FB_data[~FB_data['combo_id'].duplicated(keep=False)]
if not single_allele_rows.empty:
    raise ValueError('Single allele combinations present:', single_allele_rows['combo_id'].to_list())

# Check that only one label per FBco and FBal.
# A '|' inside a name means several labels were joined together; match it as a
# literal character (regex=False) rather than through an escaped regex.
multi_label_combos = FB_data[FB_data['combo_name'].str.contains('|', regex=False)]
if not multi_label_combos.empty:
    raise ValueError('Multiple labels for combination:', multi_label_combos[['combo_id', 'combo_name']])

multi_label_alleles = FB_data[FB_data['allele_name'].str.contains('|', regex=False)]
if not multi_label_alleles.empty:
    raise ValueError('Multiple labels for allele:', multi_label_alleles[['allele_id', 'allele_name']])
|
||
# remove any synonyms that are duplicates of the label | ||
def process_synonyms(object_name: str, synonyms: str):
    """Return *synonyms* with every entry equal to *object_name* removed.

    *synonyms* is a '|'-separated string. Entries that differ from
    *object_name* are kept in their original order and re-joined with '|'.
    If *object_name* does not occur, the synonyms come back unchanged
    (previously they were silently replaced by ``None``).

    A non-string *synonyms* value (for example the NaN pandas uses for an
    empty cell) is returned as-is, since there is nothing to filter.
    """
    if not isinstance(synonyms, str):
        return synonyms
    return '|'.join(s for s in synonyms.split('|') if s != object_name)
|
||
# Run process_synonyms row by row, first for the combination label/synonym
# columns and then for the allele ones.
for label_col, synonym_col in (
    ('combo_name', 'combo_synonyms'),
    ('allele_name', 'allele_synonyms'),
):
    FB_data[synonym_col] = FB_data.apply(
        lambda row: process_synonyms(row[label_col], row[synonym_col]),
        axis=1,
    )

# Show the intersection symbol in combination names in place of the literal
# word INTERSECTION.
FB_data['combo_name'] = FB_data['combo_name'].map(
    lambda name: name.replace('INTERSECTION', '∩')
)
|
||
# merge direct and indirect tools and FBcv terms | ||
def join_direct_and_indirect(allele: str, direct: str, indirect: str):
    """Combine the direct and indirect values for one allele into one string.

    Args:
        allele: Identifier of the allele; used only in the error message.
        direct: '|'-separated string of direct values, or a non-string
            placeholder (such as NaN) when there are none.
        indirect: Same as *direct*, for the indirect values.

    Returns:
        ``direct|indirect`` when both are strings, otherwise whichever one is
        a string.

    Raises:
        ValueError: If neither *direct* nor *indirect* is a string.
    """
    # Keep only real strings; missing cells arrive as non-string values.
    present = [value for value in (direct, indirect) if isinstance(value, str)]
    if not present:
        raise ValueError(f'No direct or indirect tools for {allele}')
    return '|'.join(present)
|
||
# Fold each direct/indirect column pair into one merged column, row by row.
for merged_col, direct_col, indirect_col in (
    ('tool_id', 'direct_tool_id', 'indirect_tool_id'),
    ('tool_fbcv', 'direct_tool_fbcv', 'indirect_tool_fbcv'),
):
    FB_data[merged_col] = FB_data.apply(
        lambda row: join_direct_and_indirect(row['allele_id'], row[direct_col], row[indirect_col]),
        axis=1,
    )

# The four source columns are no longer needed once merged.
FB_data = FB_data.drop(
    columns=['direct_tool_id', 'indirect_tool_id', 'direct_tool_fbcv', 'indirect_tool_fbcv']
)
|
||
# Build the list of FBcv term IDs to keep: everything the adapter yields from
# descendants() for FBcv:0009027 ('split driver fragment').
# NOTE(review): whether descendants() also yields FBcv:0009027 itself, and
# which relationship types it follows, depends on oaklib defaults - confirm.
FBcv_adapter = get_adapter("sqlite:obo:fbcv")
split_IDs = list(FBcv_adapter.descendants('FBcv:0009027'))
|
||
def process_split_cv_terms(cv_terms: str, allele_ID: str, ID_list=split_IDs):
    """Filter a '|'-separated FBcv term string down to the terms in *ID_list*.

    Args:
        cv_terms: '|'-separated term IDs.
        allele_ID: Allele identifier; used only in the error message.
        ID_list: Collection of acceptable term IDs (defaults to ``split_IDs``).

    Returns:
        The surviving terms, in their original order, joined with '|'.

    Raises:
        ValueError: If none of the terms is in *ID_list*.
    """
    kept = [term for term in cv_terms.split('|') if term in ID_list]
    if not kept:
        raise ValueError(f"No valid FBcv terms for {allele_ID}")
    return '|'.join(kept)


# Apply the filter to every row, overwriting tool_fbcv with the reduced list.
FB_data['tool_fbcv'] = FB_data.apply(
    lambda row: process_split_cv_terms(row['tool_fbcv'], row['allele_id']),
    axis=1,
)
|
||
# try to choose a symbol for each FBco | ||
|
||
# Two capital letters, one or more digits, then an optional single capital
# letter (e.g. 'SS01234' or 'MB005B'). Group 1 is the part without that
# trailing letter.
symbol_pattern = re.compile(r'^([A-Z]{2}[0-9]+)[A-Z]?$')


def choose_symbol(synonyms: str):
    """Pick a symbol from among the synonyms if a single suitable one exists.

    Args:
        synonyms: '|'-separated synonym string. Any non-string value (such as
            the NaN pandas uses for an empty cell) is treated as "no synonyms".

    Returns:
        * the synonym itself when exactly one synonym matches
          ``symbol_pattern``;
        * the shared stem (group 1, i.e. without the optional trailing
          letter) when several synonyms match and all have the same stem;
        * ``""`` in every other case.
    """
    if not isinstance(synonyms, str):
        return ""

    # Match each synonym once and keep only the successful matches.
    matches = [m for m in map(symbol_pattern.match, synonyms.split('|')) if m]
    if len(matches) == 1:
        return matches[0].group(0)

    # Zero matches gives an empty set; conflicting stems give more than one.
    stems = {m.group(1) for m in matches}
    if len(stems) == 1:
        return stems.pop()
    return ""
|
||
# Derive a symbol for every row from its combination synonyms; rows without a
# suitable synonym get an empty string.
FB_data['combo_symbol'] = FB_data['combo_synonyms'].map(choose_symbol)

# Write the processed table out as a tab-separated file without the index.
FB_data.to_csv('tmp/FB_data_processed.tsv', sep='\t', index=None)
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.