From 09b9c7cf9e72dd8cb2829e0b2ec047545917a9d0 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 30 Nov 2023 15:18:46 +0000 Subject: [PATCH 1/9] add transcript flag for VEP --- .../run_structural_variants.py | 2 +- cmat/consequence_prediction/common/biomart.py | 1 + cmat/consequence_prediction/common/vep.py | 38 ++++++---- .../snp_indel_variants/pipeline.py | 19 +++-- .../consequence_prediction/common/test_vep.py | 75 +++++++++++++++++-- 5 files changed, 108 insertions(+), 27 deletions(-) diff --git a/bin/consequence_prediction/run_structural_variants.py b/bin/consequence_prediction/run_structural_variants.py index ad5d8207..e58cf57f 100755 --- a/bin/consequence_prediction/run_structural_variants.py +++ b/bin/consequence_prediction/run_structural_variants.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""A wrapper script for running the repeat expansion pipeline.""" +"""A wrapper script for running the structural variant pipeline.""" import argparse from cmat.consequence_prediction.structural_variants import pipeline diff --git a/cmat/consequence_prediction/common/biomart.py b/cmat/consequence_prediction/common/biomart.py index 2bd3ac55..3152ab58 100644 --- a/cmat/consequence_prediction/common/biomart.py +++ b/cmat/consequence_prediction/common/biomart.py @@ -65,6 +65,7 @@ def split_into_chunks(lst, max_size, delimiter=','): return chunks +# TODO flag to get transcripts def query_biomart(key_column, query_column, identifier_list): """Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe. diff --git a/cmat/consequence_prediction/common/vep.py b/cmat/consequence_prediction/common/vep.py index 6a8b65c5..eae1ce74 100755 --- a/cmat/consequence_prediction/common/vep.py +++ b/cmat/consequence_prediction/common/vep.py @@ -65,38 +65,49 @@ def load_consequence_severity_rank(): return {term: index for index, term in enumerate(get_severity_ranking())} -def most_severe_consequence(consequence_terms, consequence_term_severity_rank): - return min(consequence_terms, key=lambda term: consequence_term_severity_rank[term]) +def most_severe_consequence(consequence_terms_with_transcripts, consequence_term_severity_rank): + return min(consequence_terms_with_transcripts, + key=lambda term_and_transcript: consequence_term_severity_rank[term_and_transcript[0]])[0] -def most_severe_consequence_per_gene(variant_identifier, consequences): +def most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts): results = [] consequence_term_severity_rank = load_consequence_severity_rank() consequences_per_gene = defaultdict(list) for c in consequences: key = (c['gene_id'], c.get('gene_symbol', '')) - consequences_per_gene[key].extend(term for term in c['consequence_terms']) - for (gene_id, gene_symbol), terms in consequences_per_gene.items(): - most_severe_consequence_term = most_severe_consequence(terms, consequence_term_severity_rank) - results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term)) + # Keep track of consequence term alongside transcript ID + consequences_per_gene[key].extend((term, c['transcript_id']) for term in c['consequence_terms']) + for (gene_id, gene_symbol), terms_with_transcripts in consequences_per_gene.items(): + most_severe_consequence_term = most_severe_consequence(terms_with_transcripts, consequence_term_severity_rank) + # If we're including transcripts, need to include every 
transcript associated with the most severe consequence + if include_transcripts: + for term, transcript_id in terms_with_transcripts: + if term == most_severe_consequence_term: + results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term, transcript_id)) + else: + results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term)) return results -def overall_most_severe_consequence(variant_identifier, consequences): +def overall_most_severe_consequence(variant_identifier, consequences, include_transcripts): results = [] consequence_term_severity_rank = load_consequence_severity_rank() # Flatten the list of consequence terms and find the most severe one - all_consequence_terms = [term for c in consequences for term in c['consequence_terms']] + all_consequence_terms = [(term, c['transcript_id']) for c in consequences for term in c['consequence_terms']] most_severe_consequence_term = most_severe_consequence(all_consequence_terms, consequence_term_severity_rank) # Keep only consequences which include the most severe consequence term. for c in consequences: if most_severe_consequence_term in c['consequence_terms']: - results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term)) + if include_transcripts: + results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term, c['transcript_id'])) + else: + results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term)) return results -def extract_consequences(vep_results, acceptable_biotypes): +def extract_consequences(vep_results, acceptable_biotypes, include_transcripts): """Given VEP results, return a list of consequences matching certain criteria. Args: @@ -104,6 +115,7 @@ def extract_consequences(vep_results, acceptable_biotypes): acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes will be ignored. + include_transcripts: whether to include transcript IDs alongside consequence terms """ results_by_variant = defaultdict(list) for result in vep_results: @@ -120,11 +132,11 @@ def extract_consequences(vep_results, acceptable_biotypes): # For genes overlapping the variant, we report the most severe consequence per gene. 
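        # A minimal sketch of how the severity ranking drives the term selection above (toy
        # ranking assumed here; the real one comes from load_consequence_severity_rank(),
        # where a lower rank means a more severe consequence):
        #     rank = {'missense_variant': 0, 'stop_retained_variant': 1}
        #     pairs = [('stop_retained_variant', 'ENST00000000001'), ('missense_variant', 'ENST00000000002')]
        #     min(pairs, key=lambda p: rank[p[0]])[0]  # -> 'missense_variant'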
if overlapping_consequences: - consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences) + consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences, include_transcripts) # If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes # associated with that consequence else: - consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences) + consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts) results_by_variant[variant_identifier].extend(consequences_for_variant) return results_by_variant diff --git a/cmat/consequence_prediction/snp_indel_variants/pipeline.py b/cmat/consequence_prediction/snp_indel_variants/pipeline.py index cf3a3951..a33a0c6a 100755 --- a/cmat/consequence_prediction/snp_indel_variants/pipeline.py +++ b/cmat/consequence_prediction/snp_indel_variants/pipeline.py @@ -15,6 +15,10 @@ from cmat.consequence_prediction.common.vep import query_vep, extract_consequences, deduplicate_list parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--include-transcripts', required=False, action='store_true', + help='Whether to include transcript IDs along with consequence terms' +) logging.basicConfig() logger = logging.getLogger('consequence_mapping') @@ -106,14 +110,15 @@ def get_variants_without_consequences(results_by_variant): }) -def process_variants(variants): +def process_variants(variants, include_transcripts): """Given a list of variant IDs, return a list of consequence types (each including Ensembl gene name & ID and a functional consequence code) for a given variant. """ # Query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts # up to a standard distance (5000 nucleotides either way, which is default for VEP) from the variant. vep_results = query_vep(variants=variants) - results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}) + results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}, + include_transcripts=include_transcripts) # See if there are variants with no consequences up to the default distance variants_without_consequences = get_variants_without_consequences(results_by_variant) @@ -136,9 +141,13 @@ def main(): variants_to_query = [colon_based_id_to_vep_id(v) for v in sys.stdin.read().splitlines()] # Query VEP with all variants at once (for the purpose of efficiency), print out the consequences to STDOUT. 
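    # A sketch of the tab-separated rows printed below in each mode (values are illustrative,
    # borrowed from the test fixtures; the colon-separated variant ID form is assumed from
    # vep_id_to_colon_id):
    #     without --include-transcripts:
    #         10:27169969:C:A    ENSG00000120539    MASTL    missense_variant
    #     with --include-transcripts:
    #         10:27169969:C:A    ENSG00000120539    MASTL    missense_variant    ENST00000342386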
- consequences = process_variants(variants_to_query) - for variant_id, gene_id, gene_symbol, consequence_term in consequences: - print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term])) + consequences = process_variants(variants_to_query, args.include_transcripts) + if args.include_transcripts: + for variant_id, gene_id, gene_symbol, consequence_term, transcript_id in consequences: + print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term, transcript_id])) + else: + for variant_id, gene_id, gene_symbol, consequence_term in consequences: + print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term])) logger.info('Successfully processed {} variants'.format(len(variants_to_query))) diff --git a/tests/consequence_prediction/common/test_vep.py b/tests/consequence_prediction/common/test_vep.py index c7d5581a..dccc0891 100644 --- a/tests/consequence_prediction/common/test_vep.py +++ b/tests/consequence_prediction/common/test_vep.py @@ -11,35 +11,48 @@ def get_vep_results(): 'gene_symbol': 'MASTL', 'gene_id': 'ENSG00000120539', 'biotype': 'protein_coding', - 'consequence_terms': ['missense_variant'] + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000342386' + }, + { # same consequence and gene, different transcript + 'gene_symbol': 'MASTL', + 'gene_id': 'ENSG00000120539', + 'biotype': 'protein_coding', + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000477034' }, { # missing gene symbol 'gene_id': 'ENSG00000120538', 'biotype': 'protein_coding', - 'consequence_terms': ['missense_variant'] + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000342386' }, { # less severe consequence on same gene 'gene_id': 'ENSG00000120538', 'biotype': 'protein_coding', - 'consequence_terms': ['stop_retained_variant'] + 'consequence_terms': ['stop_retained_variant'], + 'transcript_id': 'ENST00000342386' }, { # less severe consequence on distinct gene 'gene_id': 'ENSG00000120537', 'biotype': 'protein_coding', - 'consequence_terms': ['stop_retained_variant'] + 'consequence_terms': ['stop_retained_variant'], + 'transcript_id': 'ENST00000342385' }, { # non-overlapping consequences 'gene_symbol': 'ACBD5', 'gene_id': 'ENSG00000107897', 'biotype': 'protein_coding', 'distance': 5, - 'consequence_terms': ['3_prime_UTR_variant'] + 'consequence_terms': ['3_prime_UTR_variant'], + 'transcript_id': 'ENST00000676731' }, { # other biotypes 'gene_symbol': 'MIR125A', 'gene_id': 'ENSG00000208008', 'biotype': 'miRNA', - 'consequence_terms': ['missense_variant'] + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000385273' } ] } @@ -52,6 +65,7 @@ def test_extract_consequences(): results = extract_consequences( vep_results=vep_results, acceptable_biotypes=['protein_coding'], + include_transcripts=False ) assert results == {'10 27169969 . C A': [ ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), @@ -60,23 +74,53 @@ def test_extract_consequences(): ]} +def test_extract_consequences_with_transcripts(): + """Verifies behaviour of extract_consequences with transcript flag set.""" + vep_results = get_vep_results() + results = extract_consequences( + vep_results=vep_results, + acceptable_biotypes=['protein_coding'], + include_transcripts=True + ) + assert results == {'10 27169969 . C A': [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . 
C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000477034'), + ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120537', '', 'stop_retained_variant', 'ENST00000342385'), + ]} + + def test_overall_most_severe_consequence(): vep_results = get_vep_results() variant_identifier = vep_results[0]['input'] consequences = vep_results[0]['transcript_consequences'] - results = overall_most_severe_consequence(variant_identifier, consequences) + results = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts=False) assert results == [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000208008', 'MIR125A', 'missense_variant') ] +def test_overall_most_severe_consequence_with_transcripts(): + vep_results = get_vep_results() + variant_identifier = vep_results[0]['input'] + consequences = vep_results[0]['transcript_consequences'] + results = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts=True) + assert results == [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000477034'), + ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000208008', 'MIR125A', 'missense_variant', 'ENST00000385273') + ] + + def test_most_severe_consequence_per_gene(): vep_results = get_vep_results() variant_identifier = vep_results[0]['input'] consequences = vep_results[0]['transcript_consequences'] - results = most_severe_consequence_per_gene(variant_identifier, consequences) + results = most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts=False) assert results == [ ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant'), @@ -84,3 +128,18 @@ def test_most_severe_consequence_per_gene(): ('10 27169969 . C A', 'ENSG00000107897', 'ACBD5', '3_prime_UTR_variant'), ('10 27169969 . C A', 'ENSG00000208008', 'MIR125A', 'missense_variant') ] + + +def test_most_severe_consequence_per_gene_with_transcripts(): + vep_results = get_vep_results() + variant_identifier = vep_results[0]['input'] + consequences = vep_results[0]['transcript_consequences'] + results = most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts=True) + assert results == [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000477034'), + ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120537', '', 'stop_retained_variant', 'ENST00000342385'), + ('10 27169969 . C A', 'ENSG00000107897', 'ACBD5', '3_prime_UTR_variant', 'ENST00000676731'), + ('10 27169969 . 
C A', 'ENSG00000208008', 'MIR125A', 'missense_variant', 'ENST00000385273')
+    ]

From 1cb35be29520273ae4c17a855c01ae4e0cd6ae39 Mon Sep 17 00:00:00 2001
From: April Shen
Date: Fri, 1 Dec 2023 16:12:37 +0000
Subject: [PATCH 2/9] WIP on getting transcripts from biomart

---
 cmat/consequence_prediction/common/biomart.py | 62 +++++++++++--------
 .../repeat_expansion_variants/pipeline.py     | 24 ++++---
 .../structural_variants/pipeline.py           | 18 ++++--
 .../common/test_biomart.py                    | 12 ++++
 .../test_repeat_pipeline.py                   |  2 +-
 .../test_structural_pipeline.py               | 10 ++-
 6 files changed, 86 insertions(+), 42 deletions(-)
 create mode 100644 tests/consequence_prediction/common/test_biomart.py

diff --git a/cmat/consequence_prediction/common/biomart.py b/cmat/consequence_prediction/common/biomart.py
index 3152ab58..fd665159 100644
--- a/cmat/consequence_prediction/common/biomart.py
+++ b/cmat/consequence_prediction/common/biomart.py
@@ -15,24 +15,32 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
-# The idea behind this template is that we query BioMart with a list of identifiers (`identifier_list`) from the
-# `key_column`. For example, it can be a "hgnc_id" column, which contains a HGNC ID of a given gene. We then ask
-# BioMart to return all mappings from that column to the `query_column`. For example, it can be the "ensembl_gene_id"
-# column, which contains stable Ensembl gene ID.
-BIOMART_REQUEST_TEMPLATE = """http://www.ensembl.org/biomart/martservice?query=
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE Query>
-<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
-<Dataset name="hsapiens_gene_ensembl" interface="default">
-<Filter name="{key_column}" value="{identifier_list}"/>
-<Attribute name="{key_column}"/>
-<Attribute name="{query_column}"/>
-</Dataset></Query>
-""".replace('\n', '')
-
 # This is to avoid getting a "URL too long" exception.
 MAX_REQUEST_LENGTH = 5000
 
 
+def build_biomart_request_template(key_column, query_columns):
+    """
+    The idea behind this template is that we query BioMart with a list of identifiers (`identifier_list`) from the
+    `key_column`. For example, it can be a "hgnc_id" column, which contains a HGNC ID of a given gene. We then ask
+    BioMart to return all mappings from that column to the `query_column`. For example, it can be the "ensembl_gene_id"
+    column, which contains stable Ensembl gene ID.
+
+    Note `identifier_list` is left to be filled in later, to ensure the identifiers can be chunked appropriately.
+    """
+    biomart_request_template = f"""http://www.ensembl.org/biomart/martservice?query=
+    <?xml version="1.0" encoding="UTF-8"?>
+    <!DOCTYPE Query>
+    <Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
+    <Dataset name="hsapiens_gene_ensembl" interface="default">
+    <Filter name="{key_column}" value="{{identifier_list}}"/>
+    <Attribute name="{key_column}"/>
+    """
+    for query_column in query_columns:
+        biomart_request_template += f'<Attribute name="{query_column}"/>'
+    biomart_request_template += '</Dataset></Query>'
+    return biomart_request_template.replace('\n', '')
+
+
 # Since we are using an external API call here, the @retry decorator will ensure that any sporadic network errors will
 # be handled and the request will be retried.
 @retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger)
@@ -65,14 +73,13 @@ def split_into_chunks(lst, max_size, delimiter=','):
     return chunks
 
 
-# TODO flag to get transcripts
-def query_biomart(key_column, query_column, identifier_list):
+def query_biomart(key_column, query_columns, identifier_list):
     """Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return
     all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe.
 
     Args:
         key_column: A tuple of key column names in Ensembl and in the resulting dataframe, e.g. ('hgnc_id', 'HGNC_ID')
-        query_column: A tuple of query column names, similar to `key_column`, e.g. ('ensembl_gene_id', 'EnsemblGeneID')
+        query_columns: A list of tuples of query column names, similar to `key_column`, e.g. 
('ensembl_gene_id', 'EnsemblGeneID') identifier_list: List of identifiers to query, e.g. ['HGNC:10548', 'HGNC:10560'] Returns: @@ -82,18 +89,21 @@ def query_biomart(key_column, query_column, identifier_list): 0 HGNC:10548 [ENSG00000124788] 1 HGNC:10560 [ENSG00000285258, ENSG00000163635]""" biomart_key_column, df_key_column = key_column - biomart_query_column, df_query_column = query_column + biomart_query_columns, df_query_columns = zip(*query_columns) result = '' - for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(BIOMART_REQUEST_TEMPLATE)): + biomart_request_template = build_biomart_request_template( + key_column=biomart_key_column, + query_columns=biomart_query_columns + ) + for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(biomart_request_template)): logger.info(f'Processing chunk of {len(identifier_chunk)} records') # Construct BioMart query from the template (see explanation above). - biomart_query = BIOMART_REQUEST_TEMPLATE.format( - key_column=biomart_key_column, - query_column=biomart_query_column, - identifier_list=','.join(identifier_chunk) - ) + biomart_query = biomart_request_template.format(identifier_list=','.join(identifier_chunk)) result += process_biomart_request(biomart_query) - resulting_df = pd.read_table(StringIO(result), names=(df_key_column, df_query_column), dtype=str) + resulting_df = pd.read_table(StringIO(result), names=(df_key_column,)+df_query_columns, dtype=str) # Group all potential mappings into lists. - resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_column].apply(list).reset_index(name=df_query_column) + # TODO this associates a list of results with each identifier, which does not work + # for both genes and transcripts as it loses the association + # Maybe see if can push this step one level up... + # resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_columns].apply(list).reset_index(names=df_query_columns) return resulting_df diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index cdd98837..388d0d10 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -92,7 +92,7 @@ def load_clinvar_data(clinvar_xml): return variants.sort_values(by=['Name']), stats -def annotate_ensembl_gene_info(variants): +def annotate_ensembl_gene_info(variants, include_transcripts): """Annotate the `variants` dataframe with information about Ensembl gene ID and name""" # Ensembl gene ID can be determined using three ways, listed in the order of decreasing priority. Having multiple @@ -103,6 +103,10 @@ def annotate_ensembl_gene_info(variants): ('GeneSymbol', 'external_gene_name', lambda i: i != '-'), ('TranscriptID', 'refseq_mrna', lambda i: pd.notnull(i)), ) + query_columns = [('ensembl_gene_id', 'EnsemblGeneID')] + if include_transcripts: + query_columns.append(('ensembl_transcript_id', 'EnsemblTranscriptID')) + # This copy of the dataframe is required to facilitate filling in data using the `combine_first()` method. This # allows us to apply priorities: e.g., if a gene ID was already populated using HGNC_ID, it will not be overwritten # by a gene ID determined using GeneSymbol. 
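
A sketch of how the widened `query_biomart` interface is expected to behave once transcripts are requested, mirroring the new test_biomart.py added in this patch (the live Ensembl release determines the actual rows, and the transcript ID shown is a placeholder):

    from cmat.consequence_prediction.common.biomart import query_biomart

    df = query_biomart(
        key_column=('hgnc_id', 'HGNC_ID'),
        query_columns=[('ensembl_gene_id', 'EnsemblGeneID'),
                       ('ensembl_transcript_id', 'EnsemblTranscriptID')],
        identifier_list=['HGNC:10548', 'HGNC:10560'],
    )
    # Each row pairs a gene with one of its transcripts, so the gene-transcript
    # association is preserved, e.g.:
    #     HGNC_ID     EnsemblGeneID    EnsemblTranscriptID
    #     HGNC:10548  ENSG00000124788  ENST00000XXXXXXX
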
@@ -117,9 +121,11 @@
         # Query BioMart for Ensembl Gene IDs
         annotation_info = biomart.query_biomart(
             key_column=(column_name_in_biomart, column_name_in_dataframe),
-            query_column=('ensembl_gene_id', 'EnsemblGeneID'),
+            query_columns=query_columns,
             identifier_list=identifiers_to_query,
         )
+        # TODO collapse to list (?)
+
         # Make note where the annotations came from
         annotation_info['GeneAnnotationSource'] = column_name_in_dataframe
         # Combine the information we received with the *original* dataframe (a copy made before any iterations of this
@@ -127,16 +133,19 @@
         annotation_df = pd.merge(variants_original, annotation_info, on=column_name_in_dataframe, how='left')
         # Update main dataframe with the new values. This replaces the NaN values in the dataframe with the ones
         # available in another dataframe we just created, `annotation_df`.
+        # TODO check this combine step if the previous info is not in lists
         variants = variants \
             .set_index([column_name_in_dataframe]) \
             .combine_first(annotation_df.set_index([column_name_in_dataframe]))
 
+    # TODO check all this exploding stuff for transcripts
     # Reset index to default
     variants.reset_index(inplace=True)
     # Some records are being annotated to multiple Ensembl genes. For example, HGNC:10560 is being resolved to
     # ENSG00000285258 and ENSG00000163635. We need to explode dataframe by that column.
     variants = variants.explode('EnsemblGeneID')
 
+    # TODO another query to biomart, this one should not need to change
     # Based on the Ensembl gene ID, annotate (1) gene name and (2) which chromosome it is on
     gene_query_columns = (
         ('external_gene_name', 'EnsemblGeneName'),
@@ -145,7 +154,7 @@
     for column_name_in_biomart, column_name_in_dataframe in gene_query_columns:
         annotation_info = biomart.query_biomart(
             key_column=('ensembl_gene_id', 'EnsemblGeneID'),
-            query_column=(column_name_in_biomart, column_name_in_dataframe),
+            query_columns=[(column_name_in_biomart, column_name_in_dataframe)],
            identifier_list=sorted({str(i) for i in variants['EnsemblGeneID'] if str(i).startswith('ENSG')}),
         )
         variants = pd.merge(variants, annotation_info, on='EnsemblGeneID', how='left')
@@ -184,7 +193,7 @@ def generate_consequences_file(consequences, output_consequences):
     logger.info(f'  {sum(consequences.RepeatType == "short_tandem_repeat_expansion")} short tandem repeat expansion')
 
 
-def extract_consequences(variants):
+def extract_consequences(variants, include_transcripts):
     # Generate consequences table
     consequences = variants[variants['RecordIsComplete']] \
         .groupby(['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'], group_keys=False)['RepeatType'] \
@@ -217,11 +226,12 @@ def generate_all_variants_file(output_dataframe, variants):
     variants.to_csv(output_dataframe, sep='\t', index=False)
 
 
-def main(clinvar_xml, output_consequences=None, output_dataframe=None):
+def main(clinvar_xml, include_transcripts, output_consequences=None, output_dataframe=None):
     """Process data and generate output files.
 
     Args:
         clinvar_xml: filepath to the ClinVar XML file.
+        include_transcripts: whether to include Ensembl transcript IDs alongside gene IDs in the output consequences.
         output_consequences: filepath to the output file with variant consequences. The file uses a 6-column format
             compatible with the VEP mapping pipeline (see /consequence_prediction/README.md).
         output_dataframe: filepath to the output file with the full dataframe used in the analysis. 
This will contain @@ -247,7 +257,7 @@ def main(clinvar_xml, output_consequences=None, output_dataframe=None): return None logger.info('Match each record to Ensembl gene ID and name') - variants = annotate_ensembl_gene_info(variants) + variants = annotate_ensembl_gene_info(variants, include_transcripts) logger.info('Determine variant type and whether the record is complete') variants = variants.apply(lambda row: determine_complete(row), axis=1) @@ -255,7 +265,7 @@ def main(clinvar_xml, output_consequences=None, output_dataframe=None): logger.info('Postprocess data and output the two final tables') if output_dataframe is not None: generate_all_variants_file(output_dataframe, variants) - consequences = extract_consequences(variants) + consequences = extract_consequences(variants, include_transcripts) if output_consequences is not None: generate_consequences_file(consequences, output_consequences) diff --git a/cmat/consequence_prediction/structural_variants/pipeline.py b/cmat/consequence_prediction/structural_variants/pipeline.py index cca67c71..eff2c814 100644 --- a/cmat/consequence_prediction/structural_variants/pipeline.py +++ b/cmat/consequence_prediction/structural_variants/pipeline.py @@ -79,20 +79,26 @@ def generate_consequences_file(consequences, output_consequences): consequences.to_csv(output_consequences, sep='\t', index=False, header=False) -def main(clinvar_xml, output_consequences=None): +def main(clinvar_xml, include_transcripts, output_consequences=None): vep_results = get_vep_results(clinvar_xml) - results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}) + results_by_variant = extract_consequences( + vep_results=vep_results, + acceptable_biotypes={'protein_coding', 'miRNA'}, + include_transcripts=include_transcripts) variant_data = [] for variant_id, variant_consequences in results_by_variant.items(): for consequence_to_yield in deduplicate_list(variant_consequences): - variant_id, gene_id, gene_symbol, consequence_term = consequence_to_yield + # * operator lets us keep all the annotations, regardless of whether transcript is there or not + variant_id, *annotations = consequence_to_yield # variant_id here is the entire input to VEP, we only want the HGVS that we passed as an identifier hgvs_id = variant_id.split()[-1] - variant_data.append((hgvs_id, gene_id, gene_symbol, consequence_term)) + variant_data.append([hgvs_id] + annotations) # Return as a dataframe to be compatible with repeat expansion pipeline - consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID', - 'EnsemblGeneName', 'ConsequenceTerm')) + column_names = ['VariantID', 'EnsemblGeneID', 'EnsemblGeneName', 'ConsequenceTerm'] + if include_transcripts: + column_names.append('EnsemblTranscriptID') + consequences = pd.DataFrame(variant_data, columns=column_names) if output_consequences is not None: generate_consequences_file(consequences, output_consequences) return consequences diff --git a/tests/consequence_prediction/common/test_biomart.py b/tests/consequence_prediction/common/test_biomart.py new file mode 100644 index 00000000..6421b462 --- /dev/null +++ b/tests/consequence_prediction/common/test_biomart.py @@ -0,0 +1,12 @@ +from cmat.consequence_prediction.common.biomart import query_biomart + + +def test_query_biomart(): + key_column = ('hgnc_id', 'HGNC_ID') + query_columns = [('ensembl_gene_id', 'EnsemblGeneID'), ('ensembl_transcript_id', 'EnsemblTranscriptID')] + identifier_list = ['HGNC:10548', 'HGNC:10560'] + result_df = 
query_biomart(key_column, query_columns, identifier_list) + assert set(result_df.columns) == {'HGNC_ID', 'EnsemblGeneID', 'EnsemblTranscriptID'} + assert len(result_df) == 35 + assert set(result_df['HGNC_ID']) == set(identifier_list) + assert set(result_df['EnsemblGeneID']) == {'ENSG00000163635', 'ENSG00000124788'} diff --git a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py index 2dc466e1..27384ee2 100644 --- a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py +++ b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py @@ -24,7 +24,7 @@ def run_pipeline(resource_name): """Runs the pipeline on a given test resource and returns the output consequences as a list of lists.""" input_filename = get_test_resource(resource_name) output_consequences, output_dataframe = [tempfile.NamedTemporaryFile(delete=False) for _ in range(2)] - pipeline.main(input_filename, output_consequences.name, output_dataframe.name) + pipeline.main(input_filename, False, output_consequences.name, output_dataframe.name) consequences = [line.rstrip().split('\t') for line in open(output_consequences.name).read().splitlines()] for temp_file in (output_consequences, output_dataframe): os.remove(temp_file.name) diff --git a/tests/consequence_prediction/structural_variants/test_structural_pipeline.py b/tests/consequence_prediction/structural_variants/test_structural_pipeline.py index b22e70d4..42edaea1 100644 --- a/tests/consequence_prediction/structural_variants/test_structural_pipeline.py +++ b/tests/consequence_prediction/structural_variants/test_structural_pipeline.py @@ -16,10 +16,10 @@ def get_test_resource(resource_name): return os.path.join(module_directory, 'resources', resource_name) -def run_pipeline(resource_name): +def run_pipeline(resource_name, include_transcripts=False): """Runs the pipeline on a given test resource and returns the output consequences as a list of lists.""" input_filename = get_test_resource(resource_name) - consequences_df = pipeline.main(input_filename) + consequences_df = pipeline.main(input_filename, include_transcripts) consequences = [[col for col in row] for row in consequences_df.itertuples(index=False)] return consequences @@ -35,6 +35,12 @@ def test_successful_run(): ]) +def test_successful_run_with_transcripts(): + results = run_pipeline('precise_genomic.xml.gz', include_transcripts=True) + assert len(results) == 28 + assert ['NC_000001.11:g.25271785_25329047del', 'ENSG00000187010', 'RHD', 'stop_lost', 'ENST00000454452'] in results + + def test_has_complete_coordinates(): assert run_pipeline('complete_coordinates.xml.gz') == [] From 21498752dffce77a494629f8bdca9d31111f6781 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 5 Dec 2023 17:52:23 +0000 Subject: [PATCH 3/9] add transcript flag in repeat expansion pipeline --- cmat/consequence_prediction/common/biomart.py | 21 +++--- .../repeat_expansion_variants/pipeline.py | 75 +++++++++---------- .../test_repeat_pipeline.py | 35 +++++++++ 3 files changed, 79 insertions(+), 52 deletions(-) diff --git a/cmat/consequence_prediction/common/biomart.py b/cmat/consequence_prediction/common/biomart.py index fd665159..a571597a 100644 --- a/cmat/consequence_prediction/common/biomart.py +++ b/cmat/consequence_prediction/common/biomart.py @@ -3,7 +3,7 @@ (HGNC gene ID, gene symbol, RefSeq transcript ID) to Ensembl gene ID and name. 
The BioMart API is a bit quirky in that it uses XML to specify the request.
 """
-
+import re
 from io import StringIO
 
 import logging
@@ -38,7 +38,7 @@ def build_biomart_request_template(key_column, query_columns):
     for query_column in query_columns:
         biomart_request_template += f'<Attribute name="{query_column}"/>'
     biomart_request_template += '</Dataset></Query>'
-    return biomart_request_template.replace('\n', '')
+    return re.sub(r'\n *', '', biomart_request_template)
 
 
 # Since we are using an external API call here, the @retry decorator will ensure that any sporadic network errors will
@@ -75,7 +75,8 @@ def split_into_chunks(lst, max_size, delimiter=','):
 
 def query_biomart(key_column, query_columns, identifier_list):
     """Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return
-    all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe.
+    all mappings from those identifiers to one or more other columns (`query_columns`) in the form of a Pandas
+    dataframe.
 
     Args:
         key_column: A tuple of key column names in Ensembl and in the resulting dataframe, e.g. ('hgnc_id', 'HGNC_ID')
@@ -83,11 +84,12 @@ def query_biomart(key_column, query_columns, identifier_list):
         identifier_list: List of identifiers to query, e.g. ['HGNC:10548', 'HGNC:10560']
 
     Returns:
-        A Pandas dataframe with two columns. It will contain at most one row per input identifier. The query column will
-        always contain a *list* to support the possibility of multiple mappings. In the example above, this will be:
+        A Pandas dataframe with the key column plus one column per query column. Multiple mappings will be represented
+        as separate rows. In the example above, this will be:
         HGNC_ID EnsemblGeneID
-        0 HGNC:10548 [ENSG00000124788]
-        1 HGNC:10560 [ENSG00000285258, ENSG00000163635]"""
+        0 HGNC:10548 ENSG00000124788
+        1 HGNC:10560 ENSG00000285258
+        2 HGNC:10560 ENSG00000163635"""
     biomart_key_column, df_key_column = key_column
     biomart_query_columns, df_query_columns = zip(*query_columns)
     result = ''
@@ -101,9 +103,4 @@ def query_biomart(key_column, query_columns, identifier_list):
         biomart_query = biomart_request_template.format(identifier_list=','.join(identifier_chunk))
         result += process_biomart_request(biomart_query)
     resulting_df = pd.read_table(StringIO(result), names=(df_key_column,)+df_query_columns, dtype=str)
-    # Group all potential mappings into lists.
-    # TODO this associates a list of results with each identifier, which does not work
-    # for both genes and transcripts as it loses the association
-    # Maybe see if can push this step one level up...
-    # resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_columns].apply(list).reset_index(names=df_query_columns)
     return resulting_df
diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py
index 388d0d10..0ad662bb 100644
--- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py
+++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py
@@ -107,15 +107,17 @@ def annotate_ensembl_gene_info(variants, include_transcripts):
     if include_transcripts:
         query_columns.append(('ensembl_transcript_id', 'EnsemblTranscriptID'))
 
-    # This copy of the dataframe is required to facilitate filling in data using the `combine_first()` method. This
-    # allows us to apply priorities: e.g., if a gene ID was already populated using HGNC_ID, it will not be overwritten
-    # by a gene ID determined using GeneSymbol. 
- variants_original = variants.copy(deep=True) + # Make a copy of the variants dataframe to query. Variants will be removed from this copy as Ensembl gene IDs are + # found, ensuring that we don't make unnecessary queries. + variants_to_query = variants + + # Empty dataframe to collect annotated rows + annotated_variants = pd.DataFrame() for column_name_in_dataframe, column_name_in_biomart, filtering_function in gene_annotation_sources: # Get all identifiers we want to query BioMart with identifiers_to_query = sorted({ - i for i in variants[column_name_in_dataframe] + i for i in variants_to_query[column_name_in_dataframe] if filtering_function(i) }) # Query BioMart for Ensembl Gene IDs @@ -124,47 +126,40 @@ def annotate_ensembl_gene_info(variants, include_transcripts): query_columns=query_columns, identifier_list=identifiers_to_query, ) - # TODO collapse to list (?) # Make note where the annotations came from annotation_info['GeneAnnotationSource'] = column_name_in_dataframe - # Combine the information we received with the *original* dataframe (a copy made before any iterations of this - # cycle were allowed to run). This is similar to SQL merge. - annotation_df = pd.merge(variants_original, annotation_info, on=column_name_in_dataframe, how='left') - # Update main dataframe with the new values. This replaces the NaN values in the dataframe with the ones - # available in another dataframe we just created, `annotation_df`. - # TODO check this combine step if the previous info is not in lists - variants = variants \ - .set_index([column_name_in_dataframe]) \ - .combine_first(annotation_df.set_index([column_name_in_dataframe])) - - # TODO check all this exploding stuff for transcripts - # Reset index to default - variants.reset_index(inplace=True) - # Some records are being annotated to multiple Ensembl genes. For example, HGNC:10560 is being resolved to - # ENSG00000285258 and ENSG00000163635. We need to explode dataframe by that column. - variants = variants.explode('EnsemblGeneID') - - # TODO another query to biomart, this one should not need to change + # Combine the information we received with the *original* dataframe using an outer join. + total_merge = pd.merge(variants, annotation_info, on=column_name_in_dataframe, how='outer', indicator=True) + + # Variants annotated in this round are those present in both tables in the join, add these to the + # cumulative results. + annotated_variants = pd.concat((annotated_variants, total_merge[total_merge['_merge'] == 'both'])) + annotated_variants.drop('_merge', axis=1, inplace=True) + + # Variants that still need to be queried are those present only in the left table in the join. 
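    # A toy illustration of the indicator-based split introduced here: pd.merge(...,
    # indicator=True) adds a '_merge' column valued 'both', 'left_only' or 'right_only'
    # (the dataframes below are invented for the example):
    #     left = pd.DataFrame({'id': [1, 2]})
    #     right = pd.DataFrame({'id': [1], 'val': ['x']})
    #     merged = pd.merge(left, right, on='id', how='outer', indicator=True)
    #     merged[merged['_merge'] == 'both']       # id 1 -> annotated in this round
    #     merged[merged['_merge'] == 'left_only']  # id 2 -> retry with the next source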
+ variants_to_query = total_merge[total_merge['_merge'] == 'left_only'] + variants_to_query.drop('_merge', axis=1, inplace=True) + if len(variants_to_query) == 0: + break + # Based on the Ensembl gene ID, annotate (1) gene name and (2) which chromosome it is on - gene_query_columns = ( + gene_query_columns = [ ('external_gene_name', 'EnsemblGeneName'), ('chromosome_name', 'EnsemblChromosomeName'), + ] + annotation_info = biomart.query_biomart( + key_column=('ensembl_gene_id', 'EnsemblGeneID'), + query_columns=gene_query_columns, + identifier_list=sorted({str(i) for i in annotated_variants['EnsemblGeneID'] if str(i).startswith('ENSG')}), ) - for column_name_in_biomart, column_name_in_dataframe in gene_query_columns: - annotation_info = biomart.query_biomart( - key_column=('ensembl_gene_id', 'EnsemblGeneID'), - query_columns=[(column_name_in_biomart, column_name_in_dataframe)], - identifier_list=sorted({str(i) for i in variants['EnsemblGeneID'] if str(i).startswith('ENSG')}), - ) - variants = pd.merge(variants, annotation_info, on='EnsemblGeneID', how='left') - # Check that there are no multiple mappings for any given ID - assert variants[column_name_in_dataframe].str.len().dropna().max() == 1, \ - 'Found multiple gene ID → gene attribute mappings!' - # Convert the one-item list into a plain column - variants = variants.explode(column_name_in_dataframe) + annotated_variants = pd.merge(annotated_variants, annotation_info, on='EnsemblGeneID', how='left') + # Check that there are no multiple mappings for any given ID + # TODO replace this check + # assert variants[column_name_in_dataframe].str.len().dropna().max() == 1, \ + # 'Found multiple gene ID → gene attribute mappings!' - return variants + return annotated_variants def determine_complete(row): @@ -181,11 +176,10 @@ def determine_complete(row): def generate_consequences_file(consequences, output_consequences): """Output final table.""" - if consequences.empty: logger.info('There are no records ready for output') return - # Write the consequences table. This is used by the main evidence string generation pipeline. + # Write the consequences table. This is used by the main annotation pipeline. consequences.to_csv(output_consequences, sep='\t', index=False, header=False) # Output statistics logger.info(f'Generated {len(consequences)} consequences in total:') @@ -194,6 +188,7 @@ def generate_consequences_file(consequences, output_consequences): def extract_consequences(variants, include_transcripts): + # TODO include transcript ID column in output # Generate consequences table consequences = variants[variants['RecordIsComplete']] \ .groupby(['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'], group_keys=False)['RepeatType'] \ diff --git a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py index 27384ee2..3313938d 100644 --- a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py +++ b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py @@ -4,7 +4,10 @@ import os import tempfile +import pandas as pd + from cmat.consequence_prediction.repeat_expansion_variants import pipeline +from cmat.consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info def get_test_resource(resource_name): @@ -100,3 +103,35 @@ def test_missing_names_and_hgvs(): # ref=T, alt=TACACACACACAC => classified as trinucleotide repeat without repeating unit inference. 
['RCV001356600', 'ENSG00000136869', 'TLR4', 'trinucleotide_repeat_expansion'] ] + + +def test_annotate_genes_with_transcripts(): + """Tests annotation with genes and transcripts, using multiple sources (HGNC, gene symbol, RefSeq transcript)""" + variants = pd.DataFrame([ + ['variant_with_hgnc', 'RCV1', '-', 'HGNC:11850', None, None], + ['variant_with_gene_symbol', 'RCV2', 'PRDM12', '-', None, None], + ['variant_with_refseq', 'RCV3', '-', '-', 'NM_001377405', None], + ['variant_not_found', 'RCV4', 'blah', 'HGNC:blah', 'NM_blah', None] + ], columns=('Name', 'RCVaccession', 'GeneSymbol', 'HGNC_ID', 'TranscriptID', 'RepeatType')) + annotated_variants = annotate_ensembl_gene_info(variants, include_transcripts=True) + assert annotated_variants.shape == (7, 11) + + # Helper function to check gene/transcript annotations for a particular variant + def assert_gene_transcript(name, arr): + assert ( + annotated_variants[annotated_variants['Name'] == name][['EnsemblGeneID', 'EnsemblTranscriptID']] + .values == arr + ).all() + + assert_gene_transcript('variant_with_hgnc', [ + ['ENSG00000136869', 'ENST00000472304'], + ['ENSG00000136869', 'ENST00000394487'], + ['ENSG00000136869', 'ENST00000355622'], + ['ENSG00000136869', 'ENST00000490685'], + ]) + assert_gene_transcript('variant_with_gene_symbol', [ + ['ENSG00000130711', 'ENST00000253008'], + ['ENSG00000130711', 'ENST00000676323'], + ]) + assert_gene_transcript('variant_with_refseq', [['ENSG00000163635', 'ENST00000674280']]) + assert 'variant_not_found' not in annotated_variants['Name'] From a9776dbf03867fcffa0388a7d495eb1ab53e9fd8 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Dec 2023 11:24:50 +0000 Subject: [PATCH 4/9] refactor uniqueness checks and add tests --- .../repeat_expansion_variants/pipeline.py | 59 +++++++++++++------ .../test_repeat_pipeline.py | 30 +++++++++- 2 files changed, 70 insertions(+), 19 deletions(-) diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index 0ad662bb..97e83b08 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -144,10 +144,10 @@ def annotate_ensembl_gene_info(variants, include_transcripts): break # Based on the Ensembl gene ID, annotate (1) gene name and (2) which chromosome it is on - gene_query_columns = [ + gene_query_columns = ( ('external_gene_name', 'EnsemblGeneName'), ('chromosome_name', 'EnsemblChromosomeName'), - ] + ) annotation_info = biomart.query_biomart( key_column=('ensembl_gene_id', 'EnsemblGeneID'), query_columns=gene_query_columns, @@ -155,20 +155,21 @@ def annotate_ensembl_gene_info(variants, include_transcripts): ) annotated_variants = pd.merge(annotated_variants, annotation_info, on='EnsemblGeneID', how='left') # Check that there are no multiple mappings for any given ID - # TODO replace this check - # assert variants[column_name_in_dataframe].str.len().dropna().max() == 1, \ - # 'Found multiple gene ID → gene attribute mappings!' 
+ for _, column_name_in_dataframe in gene_query_columns: + assert_uniqueness(annotated_variants, ['EnsemblGeneID'], column_name_in_dataframe, + 'Found multiple gene ID → gene attribute mappings!') return annotated_variants -def determine_complete(row): +def determine_complete(row, include_transcripts): """Depending on the information, determine whether the record is complete, i.e., whether it has all necessary fields to be output for the final "consequences" table.""" row['RecordIsComplete'] = ( pd.notnull(row['EnsemblGeneID']) and pd.notnull(row['EnsemblGeneName']) and pd.notnull(row['RepeatType']) and + (pd.notnull(row['EnsemblTranscriptID']) if include_transcripts else True) and row['EnsemblChromosomeName'] in STANDARD_CHROMOSOME_NAMES ) return row @@ -187,22 +188,44 @@ def generate_consequences_file(consequences, output_consequences): logger.info(f' {sum(consequences.RepeatType == "short_tandem_repeat_expansion")} short tandem repeat expansion') +def assert_uniqueness(df, uniqueness_columns, target_column, error_msg): + """ + Check uniqueness of values in target_column with respect to tuples in uniqueness_columns. + + Args: + df: dataframe to check + uniqueness_columns: iterable of column names + target_column: name of column that should be unique + error_msg: message to report if uniqueness check fails + Returns: + result_df: input df containing only uniqueness_columns and target_column + """ + result_df = df.groupby(uniqueness_columns, group_keys=False)[target_column].apply(set).reset_index(name=target_column) + if result_df.empty: + return result_df + assert result_df[target_column].apply(len).dropna().max() == 1, error_msg + # Get rid of sets + result_df[target_column] = result_df[target_column].apply(list) + result_df = result_df.explode(target_column) + return result_df + + def extract_consequences(variants, include_transcripts): - # TODO include transcript ID column in output - # Generate consequences table - consequences = variants[variants['RecordIsComplete']] \ - .groupby(['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'], group_keys=False)['RepeatType'] \ - .apply(set).reset_index(name='RepeatType') + """Generate consequences table""" + # Check that for every (RCV, gene) pair or (RCV, gene, transcript) triple there is only one consequence type + unique_repeat_type_columns = ['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'] + if include_transcripts: + unique_repeat_type_columns.append('EnsemblTranscriptID') + consequences = assert_uniqueness(variants[variants['RecordIsComplete']], unique_repeat_type_columns, 'RepeatType', + 'Multiple (RCV, gene) → variant type mappings!') if consequences.empty: return consequences - # Check that for every (RCV, gene) pair there is only one consequence type - assert consequences['RepeatType'].str.len().dropna().max() == 1, 'Multiple (RCV, gene) → variant type mappings!' 
- # Get rid of sets - consequences['RepeatType'] = consequences['RepeatType'].apply(list) - consequences = consequences.explode('RepeatType') # Form a four-column file compatible with the consequence mapping pipeline, for example: # RCV000005966 ENSG00000156475 PPP2R2B trinucleotide_repeat_expansion - consequences = consequences[['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType']] + output_columns = ['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType'] + if include_transcripts: + output_columns.append('EnsemblTranscriptID') + consequences = consequences[output_columns] consequences.sort_values(by=['RepeatType', 'RCVaccession', 'EnsemblGeneID'], inplace=True) # Check that there are no empty cells in the final consequences table assert consequences.isnull().to_numpy().sum() == 0 @@ -255,7 +278,7 @@ def main(clinvar_xml, include_transcripts, output_consequences=None, output_data variants = annotate_ensembl_gene_info(variants, include_transcripts) logger.info('Determine variant type and whether the record is complete') - variants = variants.apply(lambda row: determine_complete(row), axis=1) + variants = variants.apply(lambda row: determine_complete(row, include_transcripts), axis=1) logger.info('Postprocess data and output the two final tables') if output_dataframe is not None: diff --git a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py index 3313938d..7403a9ed 100644 --- a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py +++ b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py @@ -5,9 +5,10 @@ import tempfile import pandas as pd +import pytest from cmat.consequence_prediction.repeat_expansion_variants import pipeline -from cmat.consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info +from cmat.consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info, assert_uniqueness def get_test_resource(resource_name): @@ -105,6 +106,33 @@ def test_missing_names_and_hgvs(): ] +def test_assert_uniqueness(): + uniqueness_columns = ['letter', 'number'] + target_column = 'target' + df = pd.DataFrame([ + ['A', 1, 'not important', 'something'], + ['A', 1, 'not important', 'something'], + ['B', 2, 'not important', 'something else'] + ], columns=uniqueness_columns + ['not important column', target_column]) + result_df = assert_uniqueness(df, uniqueness_columns, target_column, 'failure') + assert result_df.equals(pd.DataFrame([ + ['A', 1, 'something'], + ['B', 2, 'something else'] + ], columns=uniqueness_columns + [target_column])) + + +def test_assert_uniqueness_failure(): + uniqueness_columns = ['letter', 'number'] + target_column = 'target' + df = pd.DataFrame([ + ['A', 1, 'something'], + ['A', 1, 'something else'], + ['B', 2, 'something'] + ], columns=uniqueness_columns + [target_column]) + with pytest.raises(AssertionError): + assert_uniqueness(df, uniqueness_columns, target_column, 'failure') + + def test_annotate_genes_with_transcripts(): """Tests annotation with genes and transcripts, using multiple sources (HGNC, gene symbol, RefSeq transcript)""" variants = pd.DataFrame([ From 2d14a3e59972e3e88610de54f74d5fa36f79ac27 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Dec 2023 12:49:07 +0000 Subject: [PATCH 5/9] add flags to scripts --- bin/consequence_prediction/run_repeat_expansion_variants.py | 6 +++++- 
bin/consequence_prediction/run_structural_variants.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/consequence_prediction/run_repeat_expansion_variants.py b/bin/consequence_prediction/run_repeat_expansion_variants.py index a1ef8cdb..078c8a08 100755 --- a/bin/consequence_prediction/run_repeat_expansion_variants.py +++ b/bin/consequence_prediction/run_repeat_expansion_variants.py @@ -9,6 +9,10 @@ '--clinvar-xml', required=True, help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)' ) +parser.add_argument( + '--include-transcripts', required=False, action='store_true', + help='Whether to include transcript IDs along with consequence terms' +) parser.add_argument( '--output-consequences', required=True, help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.' @@ -18,4 +22,4 @@ help='File to output full dataframe for subsequent analysis and debugging.' ) args = parser.parse_args() -pipeline.main(args.clinvar_xml, args.output_consequences, args.output_dataframe) +pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences, args.output_dataframe) diff --git a/bin/consequence_prediction/run_structural_variants.py b/bin/consequence_prediction/run_structural_variants.py index e58cf57f..3ef5093c 100755 --- a/bin/consequence_prediction/run_structural_variants.py +++ b/bin/consequence_prediction/run_structural_variants.py @@ -9,10 +9,14 @@ '--clinvar-xml', required=True, help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)' ) +parser.add_argument( + '--include-transcripts', required=False, action='store_true', + help='Whether to include transcript IDs along with consequence terms' +) parser.add_argument( '--output-consequences', required=True, help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.' 
) args = parser.parse_args() -pipeline.main(args.clinvar_xml, args.output_consequences) +pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences) From db26e08d0d704a558cafe63da05774238910d8d3 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 7 Dec 2023 15:41:46 +0000 Subject: [PATCH 6/9] WIP for transcripts in annotation --- bin/evaluation/map_genes.py | 3 ++- cmat/output_generation/annotated_clinvar.py | 1 + cmat/output_generation/consequence_type.py | 1 + pipelines/annotation_pipeline.nf | 17 +++++++++++------ 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/bin/evaluation/map_genes.py b/bin/evaluation/map_genes.py index f74c82e2..3b89c085 100644 --- a/bin/evaluation/map_genes.py +++ b/bin/evaluation/map_genes.py @@ -53,7 +53,8 @@ def load_clinvar_data(clinvar_xml): def main(clinvar_xml, output_file): """Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV.""" variants = load_clinvar_data(clinvar_xml) - annotated_variants = annotate_ensembl_gene_info(variants) + # Don't include transcripts for evaluation + annotated_variants = annotate_ensembl_gene_info(variants, include_transcripts=False) annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False) diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py index bba3bb42..7c05dd20 100644 --- a/cmat/output_generation/annotated_clinvar.py +++ b/cmat/output_generation/annotated_clinvar.py @@ -91,6 +91,7 @@ def annotate(self, record): self.overall_counts['both_measure_and_trait'] += 1 def annotate_and_count_measure(self, record): + # TODO include transcript if present in variant_to_gene_mappings consequence_types, variant_category = get_consequence_types(record.measure, self.variant_to_gene_mappings) record.measure.add_ensembl_annotations(consequence_types) diff --git a/cmat/output_generation/consequence_type.py b/cmat/output_generation/consequence_type.py index 48ad3bb8..30ae58a4 100644 --- a/cmat/output_generation/consequence_type.py +++ b/cmat/output_generation/consequence_type.py @@ -15,6 +15,7 @@ def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term): def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None): + # TODO adapt for transcripts if present """ Return a dictionary of consequence information extracted from the given file. If consequence_type_dict is provided then the information will be merge into this dictionary. diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index 60d38f85..1fc9fb48 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -8,11 +8,12 @@ def helpMessage() { Generate ClinVar evidence strings for Open Targets, or annotated ClinVar XML. 
Params: - --output_dir Directory for output - --schema Open Targets JSON schema version (optional, will output XML if omitted) - --clinvar ClinVar XML file (optional, will download latest if omitted) - --mappings Trait mappings file (optional, will use a default path if omitted) - --evaluate Whether to run evaluation or not (default false) + --output_dir Directory for output + --schema Open Targets JSON schema version (optional, will output XML if omitted) + --clinvar ClinVar XML file (optional, will download latest if omitted) + --mappings Trait mappings file (optional, will use a default path if omitted) + --include_transcripts Whether to include transcripts in consequences (default false) + --evaluate Whether to run evaluation or not (default false) """ } @@ -21,6 +22,7 @@ params.output_dir = null params.schema = null params.clinvar = null params.mappings = '${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv' +params.include_transcripts = false params.evaluate = false if (params.help) { @@ -31,7 +33,7 @@ if (!params.output_dir) { } batchRoot = params.output_dir codeRoot = "${projectDir}/.." - +includeTranscriptsFlag = params.include_transcripts ? "--include-transcripts" : "" /* * Main workflow. @@ -135,6 +137,7 @@ process runSnpIndel { -N 200 `# Number of records (lines) per worker` \ --tmpdir . `# Store temporary files in the current directory to avoid /tmp overflow` \ \${PYTHON_BIN} "${codeRoot}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \ + ${includeTranscriptsFlag} \ | sort -u > consequences_snp.tsv """ } @@ -161,6 +164,7 @@ process runRepeat { """ \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_repeat_expansion_variants.py \ --clinvar-xml ${clinvarXml} \ + ${includeTranscriptsFlag} \ --output-consequences consequences_repeat.tsv # create an empty file if nothing generated @@ -191,6 +195,7 @@ process runStructural { """ \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_structural_variants.py \ --clinvar-xml ${clinvarXml} \ + ${includeTranscriptsFlag} \ --output-consequences consequences_structural.tsv # create an empty file if nothing generated From 7716caa6416440cbdedcb05dbd0887097a1daa96 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 11 Dec 2023 14:29:09 +0000 Subject: [PATCH 7/9] add transcript id to annotated xml output --- cmat/output_generation/annotated_clinvar.py | 11 ++++++++--- cmat/output_generation/consequence_type.py | 17 +++++++++++------ .../expected_annotation_output.xml.gz | Bin 2167 -> 2174 bytes 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py index 7c05dd20..3c17afe5 100644 --- a/cmat/output_generation/annotated_clinvar.py +++ b/cmat/output_generation/annotated_clinvar.py @@ -91,7 +91,6 @@ def annotate(self, record): self.overall_counts['both_measure_and_trait'] += 1 def annotate_and_count_measure(self, record): - # TODO include transcript if present in variant_to_gene_mappings consequence_types, variant_category = get_consequence_types(record.measure, self.variant_to_gene_mappings) record.measure.add_ensembl_annotations(consequence_types) @@ -242,8 +241,14 @@ def add_ensembl_annotations(self, consequences): attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ') so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term), 'DB': 'Sequence Ontology'}) - ensembl_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_gene_id, 'DB': 'Ensembl'}) - 
attr_set_elt.extend((attribute_elt, so_elt, ensembl_elt))
+        ensembl_gene_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_gene_id,
+                                                      'DB': 'Ensembl Gene'})
+        attr_set_elt.extend((attribute_elt, so_elt, ensembl_gene_elt))
+        # Add transcript if present
+        if consequence_attributes.ensembl_transcript_id:
+            ensembl_transcript_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_transcript_id,
+                                                                'DB': 'Ensembl Transcript'})
+            attr_set_elt.append(ensembl_transcript_elt)
             consequence_elts.append(attr_set_elt)
         self.measure_xml.extend(consequence_elts)

diff --git a/cmat/output_generation/consequence_type.py b/cmat/output_generation/consequence_type.py
index 30ae58a4..2b4097ff 100644
--- a/cmat/output_generation/consequence_type.py
+++ b/cmat/output_generation/consequence_type.py
@@ -10,15 +10,14 @@
 logger = logging.getLogger(__package__)
 
-def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term):
-    consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term)))
+def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term, ensembl_transcript_id=None):
+    consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term), ensembl_transcript_id))
 
 
 def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
-    # TODO adapt for transcripts if present
     """
     Return a dictionary of consequence information extracted from the given file.
-    If consequence_type_dict is provided then the information will be merge into this dictionary.
+    If consequence_type_dict is provided then the information will be merged into this dictionary.
     """
     logger.info('Loading mapping rs -> ENSG/SOterms')
     if consequence_type_dict is None:
@@ -41,7 +40,12 @@ def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
             logger.warning('Skip line with missing gene ID: {}'.format(line))
             continue
 
-        process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term)
+        # Include transcript if present
+        if len(line_list) >= 5:
+            ensembl_transcript_id = line_list[4]
+            process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term, ensembl_transcript_id)
+        else:
+            process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term)
 
     logger.info('{} rs->ENSG/SOterms mappings loaded'.format(len(consequence_type_dict)))
     return consequence_type_dict
@@ -112,9 +116,10 @@ class ConsequenceType:
     with relationship to ensembl gene IDs and SO terms
     """
 
-    def __init__(self, ensembl_gene_id, so_term):
+    def __init__(self, ensembl_gene_id, so_term, ensembl_transcript_id=None):
         self.ensembl_gene_id = ensembl_gene_id
         self.so_term = so_term
+        self.ensembl_transcript_id = ensembl_transcript_id
 
     def __eq__(self, other):
         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
diff --git a/tests/output_generation/resources/expected_annotation_output.xml.gz b/tests/output_generation/resources/expected_annotation_output.xml.gz
index 11e586b44c9fb6ff121ab487ea3885e87cba7fdd..fd2b56cd58246150388794d5aebdff3ae8a51947 100644
GIT binary patch
[base85-encoded gzip data omitted: literal 2174 (new) and literal 2167 (old) for the updated expected_annotation_output.xml.gz test fixture]
From: April Shen
Date: Mon, 11 Dec 2023 14:39:49 +0000
Subject: [PATCH 8/9] update documentation

---
 README.md                                     | 1 +
 .../repeat_expansion_variants/pipeline.py     | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7e9e5864..87c0676b
100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ --mappings ${LATEST_MAPPINGS} \ -resume ``` +You can use the `--include_transcripts` flag to also include transcript annotations with the functional consequences. By default, the pipeline will download and annotate the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag. diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index 97e83b08..6002fcb5 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -249,7 +249,7 @@ def main(clinvar_xml, include_transcripts, output_consequences=None, output_data Args: clinvar_xml: filepath to the ClinVar XML file. - include_transcripts: + include_transcripts: whether to include transcript IDs along with consequence terms. output_consequences: filepath to the output file with variant consequences. The file uses a 6-column format compatible with the VEP mapping pipeline (see /consequence_prediction/README.md). output_dataframe: filepath to the output file with the full dataframe used in the analysis. This will contain From 4f8d1b15ffc24baa23e3cba340f9d0f2d660629c Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 19 Dec 2023 08:36:27 +0000 Subject: [PATCH 9/9] update log statement --- .../repeat_expansion_variants/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index 6002fcb5..9c2bff8f 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -274,7 +274,7 @@ def main(clinvar_xml, include_transcripts, output_consequences=None, output_data logger.info('No variants to process') return None - logger.info('Match each record to Ensembl gene ID and name') + logger.info(f'Match each record to Ensembl gene ID{", transcript ID," if include_transcripts else ""} and gene name') variants = annotate_ensembl_gene_info(variants, include_transcripts) logger.info('Determine variant type and whether the record is complete')
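
For illustration, below is a minimal sketch of how a downstream consumer could read the transcript-aware consequences TSV that these patches produce. The column order assumed here (variant identifier, Ensembl gene ID, gene symbol, SO term, then an optional Ensembl transcript ID in the fifth column) is inferred from `process_consequence_type_file` in PATCH 7/9; the `load_consequences` helper itself is hypothetical and not part of cmat.

```python
from collections import defaultdict

def load_consequences(path):
    """Map variant ID -> list of (gene_id, so_term, transcript_id or None).

    Hypothetical helper; the column order is an assumption based on
    process_consequence_type_file, where the transcript ID, when present,
    sits at index 4.
    """
    consequences = defaultdict(list)
    with open(path) as tsv:
        for line in tsv:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 4:
                continue  # skip malformed rows rather than failing outright
            variant_id, gene_id, _gene_symbol, so_term = fields[:4]
            # The fifth column only exists when --include-transcripts was used
            transcript_id = fields[4] if len(fields) >= 5 else None
            consequences[variant_id].append((gene_id, so_term, transcript_id))
    return consequences
```

Rows generated without the flag simply map to a `None` transcript, mirroring the `ensembl_transcript_id=None` default on `ConsequenceType`, so a consumer can handle both the four- and five-column formats with a single code path.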