Skip to content

Commit

Permalink
Merge pull request #405 from apriltuesday/consequence-transcripts
Browse files Browse the repository at this point in the history
Add transcripts to consequences
  • Loading branch information
apriltuesday authored Dec 19, 2023
2 parents 4d367cd + 4f8d1b1 commit 2b968b4
Show file tree
Hide file tree
Showing 17 changed files with 367 additions and 137 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \
--mappings ${LATEST_MAPPINGS} \
-resume
```
You can use the `--include_transcripts` flag to also include transcript annotations with the functional consequences.

By default, the pipeline will download and annotate the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag.

Expand Down
6 changes: 5 additions & 1 deletion bin/consequence_prediction/run_repeat_expansion_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
'--clinvar-xml', required=True,
help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)'
)
parser.add_argument(
'--include-transcripts', required=False, action='store_true',
help='Whether to include transcript IDs along with consequence terms'
)
parser.add_argument(
'--output-consequences', required=True,
help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
Expand All @@ -18,4 +22,4 @@
help='File to output full dataframe for subsequent analysis and debugging.'
)
args = parser.parse_args()
pipeline.main(args.clinvar_xml, args.output_consequences, args.output_dataframe)
pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences, args.output_dataframe)
8 changes: 6 additions & 2 deletions bin/consequence_prediction/run_structural_variants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
"""A wrapper script for running the repeat expansion pipeline."""
"""A wrapper script for running the structural variant pipeline."""

import argparse
from cmat.consequence_prediction.structural_variants import pipeline
Expand All @@ -9,10 +9,14 @@
'--clinvar-xml', required=True,
help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)'
)
parser.add_argument(
'--include-transcripts', required=False, action='store_true',
help='Whether to include transcript IDs along with consequence terms'
)
parser.add_argument(
'--output-consequences', required=True,
help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.'
)

args = parser.parse_args()
pipeline.main(args.clinvar_xml, args.output_consequences)
pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences)
3 changes: 2 additions & 1 deletion bin/evaluation/map_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def load_clinvar_data(clinvar_xml):
def main(clinvar_xml, output_file):
    """Load ClinVar XML, map variants to Ensembl gene IDs, and dump results to TSV.

    Args:
        clinvar_xml: path to the ClinVar XML dump to read.
        output_file: path of the TSV file to write; two columns (RCVaccession,
            EnsemblGeneID), no header, one row per mapping.
    """
    variants = load_clinvar_data(clinvar_xml)
    # Don't include transcripts for evaluation -- only gene-level IDs are compared.
    # (Removed a leftover duplicate call to annotate_ensembl_gene_info whose result
    # was immediately overwritten; it did the annotation work twice for nothing.)
    annotated_variants = annotate_ensembl_gene_info(variants, include_transcripts=False)
    annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False)


Expand Down
72 changes: 40 additions & 32 deletions cmat/consequence_prediction/common/biomart.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
(HGNC gene ID, gene symbol, RefSeq transcript ID) to Ensembl gene ID and name. The BioMart API is a bit quirky in that
it uses XML to specify the request.
"""

import re
from io import StringIO
import logging

Expand All @@ -15,24 +15,32 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# The idea behind this template is that we query BioMart with a list of identifiers (`identifier_list`) from the
# `key_column`. For example, it can be a "hgnc_id" column, which contains a HGNC ID of a given gene. We then ask
# BioMart to return all mappings from that column to the `query_column`. For example, it can be the "ensembl_gene_id"
# column, which contains stable Ensembl gene ID.
BIOMART_REQUEST_TEMPLATE = """http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "{key_column}" value = "{identifier_list}"/>
<Attribute name = "{key_column}" />
<Attribute name = "{query_column}" />
</Dataset>
</Query>""".replace('\n', '')

# This is to avoid getting a "URL too long" exception.
MAX_REQUEST_LENGTH = 5000


def build_biomart_request_template(key_column, query_columns):
    """
    Build a single-line BioMart request URL template that maps identifiers from `key_column` to each of
    the `query_columns`. For example, the key can be the "hgnc_id" column (the HGNC ID of a gene) and a
    query column can be "ensembl_gene_id" (the stable Ensembl gene ID).
    The `identifier_list` placeholder is deliberately left unexpanded so that callers can fill it in
    later, chunking the identifiers as needed to keep the request URL length acceptable.
    """
    attribute_tags = ''.join(f'<Attribute name = "{column}" />' for column in query_columns)
    template = f"""http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "{key_column}" value = "{{identifier_list}}"/>
<Attribute name = "{key_column}" />
{attribute_tags}</Dataset></Query>"""
    # Collapse to one line: drop every newline together with any indentation that follows it.
    return re.sub(r'\n *', '', template)


# Since we are using an external API call here, the @retry decorator will ensure that any sporadic network errors will
# be handled and the request will be retried.
@retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger)
Expand Down Expand Up @@ -65,34 +73,34 @@ def split_into_chunks(lst, max_size, delimiter=','):
return chunks


def query_biomart(key_column, query_column, identifier_list):
def query_biomart(key_column, query_columns, identifier_list):
"""Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return
all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe.
all mappings from those identifiers to one or more other columns (`query_columns`) in the form of a Pandas
dataframe.
Args:
key_column: A tuple of key column names in Ensembl and in the resulting dataframe, e.g. ('hgnc_id', 'HGNC_ID')
query_column: A tuple of query column names, similar to `key_column`, e.g. ('ensembl_gene_id', 'EnsemblGeneID')
query_columns: A list of tuples of query column names, similar to `key_column`, e.g. ('ensembl_gene_id', 'EnsemblGeneID')
identifier_list: List of identifiers to query, e.g. ['HGNC:10548', 'HGNC:10560']
Returns:
A Pandas dataframe with two columns. It will contain at most one row per input identifier. The query column will
always contain a *list* to support the possibility of multiple mappings. In the example above, this will be:
A Pandas dataframe containing the key column plus one column per query column. Multiple mappings will be
represented as separate rows. In the example above, this will be:
HGNC_ID EnsemblGeneID
0 HGNC:10548 [ENSG00000124788]
1 HGNC:10560 [ENSG00000285258, ENSG00000163635]"""
0 HGNC:10548 ENSG00000124788
1 HGNC:10560 ENSG00000285258
2 HGNC:10560 ENSG00000163635"""
biomart_key_column, df_key_column = key_column
biomart_query_column, df_query_column = query_column
biomart_query_columns, df_query_columns = zip(*query_columns)
result = ''
for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(BIOMART_REQUEST_TEMPLATE)):
biomart_request_template = build_biomart_request_template(
key_column=biomart_key_column,
query_columns=biomart_query_columns
)
for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(biomart_request_template)):
logger.info(f'Processing chunk of {len(identifier_chunk)} records')
# Construct BioMart query from the template (see explanation above).
biomart_query = BIOMART_REQUEST_TEMPLATE.format(
key_column=biomart_key_column,
query_column=biomart_query_column,
identifier_list=','.join(identifier_chunk)
)
biomart_query = biomart_request_template.format(identifier_list=','.join(identifier_chunk))
result += process_biomart_request(biomart_query)
resulting_df = pd.read_table(StringIO(result), names=(df_key_column, df_query_column), dtype=str)
# Group all potential mappings into lists.
resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_column].apply(list).reset_index(name=df_query_column)
resulting_df = pd.read_table(StringIO(result), names=(df_key_column,)+df_query_columns, dtype=str)
return resulting_df
38 changes: 25 additions & 13 deletions cmat/consequence_prediction/common/vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,45 +65,57 @@ def load_consequence_severity_rank():
return {term: index for index, term in enumerate(get_severity_ranking())}


def most_severe_consequence(consequence_terms, consequence_term_severity_rank):
return min(consequence_terms, key=lambda term: consequence_term_severity_rank[term])
def most_severe_consequence(consequence_terms_with_transcripts, consequence_term_severity_rank):
    """Given an iterable of (consequence term, transcript ID) pairs, return the single consequence term
    with the lowest value in `consequence_term_severity_rank`, i.e. the most severe one."""
    def severity(pair):
        # Rank lookup uses only the consequence term; the transcript ID is ignored.
        return consequence_term_severity_rank[pair[0]]
    return min(consequence_terms_with_transcripts, key=severity)[0]


def most_severe_consequence_per_gene(variant_identifier, consequences):
def most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts):
results = []
consequence_term_severity_rank = load_consequence_severity_rank()
consequences_per_gene = defaultdict(list)
for c in consequences:
key = (c['gene_id'], c.get('gene_symbol', ''))
consequences_per_gene[key].extend(term for term in c['consequence_terms'])
for (gene_id, gene_symbol), terms in consequences_per_gene.items():
most_severe_consequence_term = most_severe_consequence(terms, consequence_term_severity_rank)
results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term))
# Keep track of consequence term alongside transcript ID
consequences_per_gene[key].extend((term, c['transcript_id']) for term in c['consequence_terms'])
for (gene_id, gene_symbol), terms_with_transcripts in consequences_per_gene.items():
most_severe_consequence_term = most_severe_consequence(terms_with_transcripts, consequence_term_severity_rank)
# If we're including transcripts, need to include every transcript associated with the most severe consequence
if include_transcripts:
for term, transcript_id in terms_with_transcripts:
if term == most_severe_consequence_term:
results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term, transcript_id))
else:
results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term))
return results


def overall_most_severe_consequence(variant_identifier, consequences):
def overall_most_severe_consequence(variant_identifier, consequences, include_transcripts):
results = []
consequence_term_severity_rank = load_consequence_severity_rank()
# Flatten the list of consequence terms and find the most severe one
all_consequence_terms = [term for c in consequences for term in c['consequence_terms']]
all_consequence_terms = [(term, c['transcript_id']) for c in consequences for term in c['consequence_terms']]
most_severe_consequence_term = most_severe_consequence(all_consequence_terms, consequence_term_severity_rank)

# Keep only consequences which include the most severe consequence term.
for c in consequences:
if most_severe_consequence_term in c['consequence_terms']:
results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term))
if include_transcripts:
results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term, c['transcript_id']))
else:
results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term))
return results


def extract_consequences(vep_results, acceptable_biotypes):
def extract_consequences(vep_results, acceptable_biotypes, include_transcripts):
"""Given VEP results, return a list of consequences matching certain criteria.
Args:
vep_results: results obtained from VEP for a list of variants, in JSON format.
acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see
https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes
will be ignored.
include_transcripts: whether to include transcript IDs alongside consequence terms
"""
results_by_variant = defaultdict(list)
for result in vep_results:
Expand All @@ -120,11 +132,11 @@ def extract_consequences(vep_results, acceptable_biotypes):

# For genes overlapping the variant, we report the most severe consequence per gene.
if overlapping_consequences:
consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences)
consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences, include_transcripts)
# If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes
# associated with that consequence
else:
consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences)
consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts)
results_by_variant[variant_identifier].extend(consequences_for_variant)

return results_by_variant
Loading

0 comments on commit 2b968b4

Please sign in to comment.