Skip to content

Commit

Permalink
Merge pull request #405 from apriltuesday/consequence-transcripts
Browse files Browse the repository at this point in the history
Add transcripts to consequences
  • Loading branch information
apriltuesday authored Dec 19, 2023
2 parents 4d367cd + 4f8d1b1 commit 2b968b4
Show file tree
Hide file tree
Showing 17 changed files with 367 additions and 137 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
--mappings ${LATEST_MAPPINGS} \
-resume
```
You can use the `--include_transcripts` flag to also include transcript annotations with the functional consequences.

By default, the pipeline will download and annotate the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.

Expand Down
6 changes: 5 additions & 1 deletion bin/consequence_prediction/run_repeat_expansion_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
'--clinvar-xml', required=True,
help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)'
)
parser.add_argument(
'--include-transcripts', required=False, action='store_true',
help='Whether to include transcript IDs along with consequence terms'
)
parser.add_argument(
'--output-consequences', required=True,
help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
Expand All @@ -18,4 +22,4 @@
help='File to output full dataframe for subsequent analysis and debugging.'
)
args = parser.parse_args()
pipeline.main(args.clinvar_xml, args.output_consequences, args.output_dataframe)
pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences, args.output_dataframe)
8 changes: 6 additions & 2 deletions bin/consequence_prediction/run_structural_variants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
"""A wrapper script for running the repeat expansion pipeline."""
"""A wrapper script for running the structural variant pipeline."""

import argparse
from cmat.consequence_prediction.structural_variants import pipeline
Expand All @@ -9,10 +9,14 @@
'--clinvar-xml', required=True,
help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)'
)
parser.add_argument(
'--include-transcripts', required=False, action='store_true',
help='Whether to include transcript IDs along with consequence terms'
)
parser.add_argument(
'--output-consequences', required=True,
help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
)

args = parser.parse_args()
pipeline.main(args.clinvar_xml, args.output_consequences)
pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences)
3 changes: 2 additions & 1 deletion bin/evaluation/map_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def load_clinvar_data(clinvar_xml):
def main(clinvar_xml, output_file):
    """Load ClinVar XML, map variants to Ensembl gene IDs, and dump results to TSV.

    Args:
        clinvar_xml: path to the ClinVar XML dump to read.
        output_file: path of the TSV file to write; two columns (RCVaccession,
            EnsemblGeneID), no header, one row per mapping.
    """
    variants = load_clinvar_data(clinvar_xml)
    # Don't include transcripts for evaluation -- only gene-level IDs are compared.
    # (Removed a leftover duplicate call to annotate_ensembl_gene_info whose result
    # was immediately overwritten; it did the annotation work twice for nothing.)
    annotated_variants = annotate_ensembl_gene_info(variants, include_transcripts=False)
    annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False)


Expand Down
72 changes: 40 additions & 32 deletions cmat/consequence_prediction/common/biomart.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
(HGNC gene ID, gene symbol, RefSeq transcript ID) to Ensembl gene ID and name. The BioMart API is a bit quirky in that
it uses XML to specify the request.
"""

import re
from io import StringIO
import logging

Expand All @@ -15,24 +15,32 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# The idea behind this template is that we query BioMart with a list of identifiers (`identifier_list`) from the
# `key_column`. For example, it can be a "hgnc_id" column, which contains a HGNC ID of a given gene. We then ask
# BioMart to return all mappings from that column to the `query_column`. For example, it can be the "ensembl_gene_id"
# column, which contains stable Ensembl gene ID.
BIOMART_REQUEST_TEMPLATE = """http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "{key_column}" value = "{identifier_list}"/>
<Attribute name = "{key_column}" />
<Attribute name = "{query_column}" />
</Dataset>
</Query>""".replace('\n', '')

# This is to avoid getting a "URL too long" exception.
MAX_REQUEST_LENGTH = 5000


def build_biomart_request_template(key_column, query_columns):
    """
    Build a single-line BioMart request URL template that maps identifiers from `key_column` to each of
    the `query_columns`. For example, the key can be the "hgnc_id" column (the HGNC ID of a gene) and a
    query column can be "ensembl_gene_id" (the stable Ensembl gene ID).
    The `identifier_list` placeholder is deliberately left unexpanded so that callers can fill it in
    later, chunking the identifiers as needed to keep the request URL length acceptable.
    """
    attribute_tags = ''.join(f'<Attribute name = "{column}" />' for column in query_columns)
    template = f"""http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "{key_column}" value = "{{identifier_list}}"/>
<Attribute name = "{key_column}" />
{attribute_tags}</Dataset></Query>"""
    # Collapse to one line: drop every newline together with any indentation that follows it.
    return re.sub(r'\n *', '', template)


# Since we are using an external API call here, the @retry decorator will ensure that any sporadic network errors will
# be handled and the request will be retried.
@retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger)
Expand Down Expand Up @@ -65,34 +73,34 @@ def split_into_chunks(lst, max_size, delimiter=','):
return chunks


def query_biomart(key_column, query_column, identifier_list):
def query_biomart(key_column, query_columns, identifier_list):
"""Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return
all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe.
all mappings from those identifiers to one or more other columns (`query_columns`) in the form of a Pandas
dataframe.
Args:
key_column: A tuple of key column names in Ensembl and in the resulting dataframe, e.g. ('hgnc_id', 'HGNC_ID')
query_column: A tuple of query column names, similar to `key_column`, e.g. ('ensembl_gene_id', 'EnsemblGeneID')
query_columns: A list of tuples of query column names, similar to `key_column`, e.g. ('ensembl_gene_id', 'EnsemblGeneID')
identifier_list: List of identifiers to query, e.g. ['HGNC:10548', 'HGNC:10560']
Returns:
A Pandas dataframe with two columns. It will contain at most one row per input identifier. The query column will
always contain a *list* to support the possibility of multiple mappings. In the example above, this will be:
A Pandas dataframe containing the key column plus one column per query column. Multiple mappings will be
represented as separate rows. In the example above, this will be:
HGNC_ID EnsemblGeneID
0 HGNC:10548 [ENSG00000124788]
1 HGNC:10560 [ENSG00000285258, ENSG00000163635]"""
0 HGNC:10548 ENSG00000124788
1 HGNC:10560 ENSG00000285258
2 HGNC:10560 ENSG00000163635"""
biomart_key_column, df_key_column = key_column
biomart_query_column, df_query_column = query_column
biomart_query_columns, df_query_columns = zip(*query_columns)
result = ''
for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(BIOMART_REQUEST_TEMPLATE)):
biomart_request_template = build_biomart_request_template(
key_column=biomart_key_column,
query_columns=biomart_query_columns
)
for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(biomart_request_template)):
logger.info(f'Processing chunk of {len(identifier_chunk)} records')
# Construct BioMart query from the template (see explanation above).
biomart_query = BIOMART_REQUEST_TEMPLATE.format(
key_column=biomart_key_column,
query_column=biomart_query_column,
identifier_list=','.join(identifier_chunk)
)
biomart_query = biomart_request_template.format(identifier_list=','.join(identifier_chunk))
result += process_biomart_request(biomart_query)
resulting_df = pd.read_table(StringIO(result), names=(df_key_column, df_query_column), dtype=str)
# Group all potential mappings into lists.
resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_column].apply(list).reset_index(name=df_query_column)
resulting_df = pd.read_table(StringIO(result), names=(df_key_column,)+df_query_columns, dtype=str)
return resulting_df
38 changes: 25 additions & 13 deletions cmat/consequence_prediction/common/vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,45 +65,57 @@ def load_consequence_severity_rank():
return {term: index for index, term in enumerate(get_severity_ranking())}


def most_severe_consequence(consequence_terms, consequence_term_severity_rank):
return min(consequence_terms, key=lambda term: consequence_term_severity_rank[term])
def most_severe_consequence(consequence_terms_with_transcripts, consequence_term_severity_rank):
    """Given an iterable of (consequence term, transcript ID) pairs, return the single consequence term
    with the lowest value in `consequence_term_severity_rank`, i.e. the most severe one."""
    def severity(pair):
        # Rank lookup uses only the consequence term; the transcript ID is ignored.
        return consequence_term_severity_rank[pair[0]]
    return min(consequence_terms_with_transcripts, key=severity)[0]


def most_severe_consequence_per_gene(variant_identifier, consequences):
def most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts):
results = []
consequence_term_severity_rank = load_consequence_severity_rank()
consequences_per_gene = defaultdict(list)
for c in consequences:
key = (c['gene_id'], c.get('gene_symbol', ''))
consequences_per_gene[key].extend(term for term in c['consequence_terms'])
for (gene_id, gene_symbol), terms in consequences_per_gene.items():
most_severe_consequence_term = most_severe_consequence(terms, consequence_term_severity_rank)
results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term))
# Keep track of consequence term alongside transcript ID
consequences_per_gene[key].extend((term, c['transcript_id']) for term in c['consequence_terms'])
for (gene_id, gene_symbol), terms_with_transcripts in consequences_per_gene.items():
most_severe_consequence_term = most_severe_consequence(terms_with_transcripts, consequence_term_severity_rank)
# If we're including transcripts, need to include every transcript associated with the most severe consequence
if include_transcripts:
for term, transcript_id in terms_with_transcripts:
if term == most_severe_consequence_term:
results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term, transcript_id))
else:
results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term))
return results


def overall_most_severe_consequence(variant_identifier, consequences):
def overall_most_severe_consequence(variant_identifier, consequences, include_transcripts):
results = []
consequence_term_severity_rank = load_consequence_severity_rank()
# Flatten the list of consequence terms and find the most severe one
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
all_consequence_terms = [(term, c['transcript_id']) for c in consequences for term in c['consequence_terms']]
most_severe_consequence_term = most_severe_consequence(all_consequence_terms, consequence_term_severity_rank)

# Keep only consequences which include the most severe consequence term.
for c in consequences:
if most_severe_consequence_term in c['consequence_terms']:
results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term))
if include_transcripts:
results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term, c['transcript_id']))
else:
results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term))
return results


def extract_consequences(vep_results, acceptable_biotypes):
def extract_consequences(vep_results, acceptable_biotypes, include_transcripts):
"""Given VEP results, return a list of consequences matching certain criteria.
Args:
vep_results: results obtained from VEP for a list of variants, in JSON format.
acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see
https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes
will be ignored.
include_transcripts: whether to include transcript IDs alongside consequence terms
"""
results_by_variant = defaultdict(list)
for result in vep_results:
Expand All @@ -120,11 +132,11 @@ def extract_consequences(vep_results, acceptable_biotypes):

# For genes overlapping the variant, we report the most severe consequence per gene.
if overlapping_consequences:
consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences)
consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences, include_transcripts)
# If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes
# associated with that consequence
else:
consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences)
consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts)
results_by_variant[variant_identifier].extend(consequences_for_variant)

return results_by_variant
Loading

0 comments on commit 2b968b4

Please sign in to comment.