From 09b9c7cf9e72dd8cb2829e0b2ec047545917a9d0 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 30 Nov 2023 15:18:46 +0000 Subject: [PATCH 1/9] add transcript flag for VEP --- .../run_structural_variants.py | 2 +- cmat/consequence_prediction/common/biomart.py | 1 + cmat/consequence_prediction/common/vep.py | 38 ++++++---- .../snp_indel_variants/pipeline.py | 19 +++-- .../consequence_prediction/common/test_vep.py | 75 +++++++++++++++++-- 5 files changed, 108 insertions(+), 27 deletions(-) diff --git a/bin/consequence_prediction/run_structural_variants.py b/bin/consequence_prediction/run_structural_variants.py index ad5d8207..e58cf57f 100755 --- a/bin/consequence_prediction/run_structural_variants.py +++ b/bin/consequence_prediction/run_structural_variants.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""A wrapper script for running the repeat expansion pipeline.""" +"""A wrapper script for running the structural variant pipeline.""" import argparse from cmat.consequence_prediction.structural_variants import pipeline diff --git a/cmat/consequence_prediction/common/biomart.py b/cmat/consequence_prediction/common/biomart.py index 2bd3ac55..3152ab58 100644 --- a/cmat/consequence_prediction/common/biomart.py +++ b/cmat/consequence_prediction/common/biomart.py @@ -65,6 +65,7 @@ def split_into_chunks(lst, max_size, delimiter=','): return chunks +# TODO flag to get transcripts def query_biomart(key_column, query_column, identifier_list): """Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe. diff --git a/cmat/consequence_prediction/common/vep.py b/cmat/consequence_prediction/common/vep.py index 6a8b65c5..eae1ce74 100755 --- a/cmat/consequence_prediction/common/vep.py +++ b/cmat/consequence_prediction/common/vep.py @@ -65,38 +65,49 @@ def load_consequence_severity_rank(): return {term: index for index, term in enumerate(get_severity_ranking())} -def most_severe_consequence(consequence_terms, consequence_term_severity_rank): - return min(consequence_terms, key=lambda term: consequence_term_severity_rank[term]) +def most_severe_consequence(consequence_terms_with_transcripts, consequence_term_severity_rank): + return min(consequence_terms_with_transcripts, + key=lambda term_and_transcript: consequence_term_severity_rank[term_and_transcript[0]])[0] -def most_severe_consequence_per_gene(variant_identifier, consequences): +def most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts): results = [] consequence_term_severity_rank = load_consequence_severity_rank() consequences_per_gene = defaultdict(list) for c in consequences: key = (c['gene_id'], c.get('gene_symbol', '')) - consequences_per_gene[key].extend(term for term in c['consequence_terms']) - for (gene_id, gene_symbol), terms in consequences_per_gene.items(): - most_severe_consequence_term = most_severe_consequence(terms, consequence_term_severity_rank) - results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term)) + # Keep track of consequence term alongside transcript ID + consequences_per_gene[key].extend((term, c['transcript_id']) for term in c['consequence_terms']) + for (gene_id, gene_symbol), terms_with_transcripts in consequences_per_gene.items(): + most_severe_consequence_term = most_severe_consequence(terms_with_transcripts, consequence_term_severity_rank) + # If we're including transcripts, need to include every 
transcript associated with the most severe consequence + if include_transcripts: + for term, transcript_id in terms_with_transcripts: + if term == most_severe_consequence_term: + results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term, transcript_id)) + else: + results.append((variant_identifier, gene_id, gene_symbol, most_severe_consequence_term)) return results -def overall_most_severe_consequence(variant_identifier, consequences): +def overall_most_severe_consequence(variant_identifier, consequences, include_transcripts): results = [] consequence_term_severity_rank = load_consequence_severity_rank() # Flatten the list of consequence terms and find the most severe one - all_consequence_terms = [term for c in consequences for term in c['consequence_terms']] + all_consequence_terms = [(term, c['transcript_id']) for c in consequences for term in c['consequence_terms']] most_severe_consequence_term = most_severe_consequence(all_consequence_terms, consequence_term_severity_rank) # Keep only consequences which include the most severe consequence term. for c in consequences: if most_severe_consequence_term in c['consequence_terms']: - results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term)) + if include_transcripts: + results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term, c['transcript_id'])) + else: + results.append((variant_identifier, c['gene_id'], c.get('gene_symbol', ''), most_severe_consequence_term)) return results -def extract_consequences(vep_results, acceptable_biotypes): +def extract_consequences(vep_results, acceptable_biotypes, include_transcripts): """Given VEP results, return a list of consequences matching certain criteria. Args: @@ -104,6 +115,7 @@ def extract_consequences(vep_results, acceptable_biotypes): acceptable_biotypes: a list of transcript biotypes to consider (as defined in Ensembl documentation, see https://www.ensembl.org/info/genome/genebuild/biotypes.html). Consequences for other transcript biotypes will be ignored. + include_transcripts: whether to include transcript IDs alongside consequence terms """ results_by_variant = defaultdict(list) for result in vep_results: @@ -120,11 +132,11 @@ def extract_consequences(vep_results, acceptable_biotypes): # For genes overlapping the variant, we report the most severe consequence per gene. 
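        # A minimal sketch of how the severity ranking drives the term selection above (toy
        # ranking assumed here; the real one comes from load_consequence_severity_rank(),
        # where a lower rank means a more severe consequence):
        #     rank = {'missense_variant': 0, 'stop_retained_variant': 1}
        #     pairs = [('stop_retained_variant', 'ENST00000000001'), ('missense_variant', 'ENST00000000002')]
        #     min(pairs, key=lambda p: rank[p[0]])[0]  # -> 'missense_variant'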
if overlapping_consequences: - consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences) + consequences_for_variant = most_severe_consequence_per_gene(variant_identifier, overlapping_consequences, include_transcripts) # If there are no consequences on overlapping genes, we take the overall most severe consequence and all genes # associated with that consequence else: - consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences) + consequences_for_variant = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts) results_by_variant[variant_identifier].extend(consequences_for_variant) return results_by_variant diff --git a/cmat/consequence_prediction/snp_indel_variants/pipeline.py b/cmat/consequence_prediction/snp_indel_variants/pipeline.py index cf3a3951..a33a0c6a 100755 --- a/cmat/consequence_prediction/snp_indel_variants/pipeline.py +++ b/cmat/consequence_prediction/snp_indel_variants/pipeline.py @@ -15,6 +15,10 @@ from cmat.consequence_prediction.common.vep import query_vep, extract_consequences, deduplicate_list parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--include-transcripts', required=False, action='store_true', + help='Whether to include transcript IDs along with consequence terms' +) logging.basicConfig() logger = logging.getLogger('consequence_mapping') @@ -106,14 +110,15 @@ def get_variants_without_consequences(results_by_variant): }) -def process_variants(variants): +def process_variants(variants, include_transcripts): """Given a list of variant IDs, return a list of consequence types (each including Ensembl gene name & ID and a functional consequence code) for a given variant. """ # Query VEP with default parameters, looking for variants affecting protein coding and miRNA transcripts # up to a standard distance (5000 nucleotides either way, which is default for VEP) from the variant. vep_results = query_vep(variants=variants) - results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}) + results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}, + include_transcripts=include_transcripts) # See if there are variants with no consequences up to the default distance variants_without_consequences = get_variants_without_consequences(results_by_variant) @@ -136,9 +141,13 @@ def main(): variants_to_query = [colon_based_id_to_vep_id(v) for v in sys.stdin.read().splitlines()] # Query VEP with all variants at once (for the purpose of efficiency), print out the consequences to STDOUT. 
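    # A sketch of the tab-separated rows printed below in each mode (values are illustrative,
    # borrowed from the test fixtures; the colon-separated variant ID form is assumed from
    # vep_id_to_colon_id):
    #     without --include-transcripts:
    #         10:27169969:C:A    ENSG00000120539    MASTL    missense_variant
    #     with --include-transcripts:
    #         10:27169969:C:A    ENSG00000120539    MASTL    missense_variant    ENST00000342386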
- consequences = process_variants(variants_to_query) - for variant_id, gene_id, gene_symbol, consequence_term in consequences: - print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term])) + consequences = process_variants(variants_to_query, args.include_transcripts) + if args.include_transcripts: + for variant_id, gene_id, gene_symbol, consequence_term, transcript_id in consequences: + print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term, transcript_id])) + else: + for variant_id, gene_id, gene_symbol, consequence_term in consequences: + print('\t'.join([vep_id_to_colon_id(variant_id), gene_id, gene_symbol, consequence_term])) logger.info('Successfully processed {} variants'.format(len(variants_to_query))) diff --git a/tests/consequence_prediction/common/test_vep.py b/tests/consequence_prediction/common/test_vep.py index c7d5581a..dccc0891 100644 --- a/tests/consequence_prediction/common/test_vep.py +++ b/tests/consequence_prediction/common/test_vep.py @@ -11,35 +11,48 @@ def get_vep_results(): 'gene_symbol': 'MASTL', 'gene_id': 'ENSG00000120539', 'biotype': 'protein_coding', - 'consequence_terms': ['missense_variant'] + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000342386' + }, + { # same consequence and gene, different transcript + 'gene_symbol': 'MASTL', + 'gene_id': 'ENSG00000120539', + 'biotype': 'protein_coding', + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000477034' }, { # missing gene symbol 'gene_id': 'ENSG00000120538', 'biotype': 'protein_coding', - 'consequence_terms': ['missense_variant'] + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000342386' }, { # less severe consequence on same gene 'gene_id': 'ENSG00000120538', 'biotype': 'protein_coding', - 'consequence_terms': ['stop_retained_variant'] + 'consequence_terms': ['stop_retained_variant'], + 'transcript_id': 'ENST00000342386' }, { # less severe consequence on distinct gene 'gene_id': 'ENSG00000120537', 'biotype': 'protein_coding', - 'consequence_terms': ['stop_retained_variant'] + 'consequence_terms': ['stop_retained_variant'], + 'transcript_id': 'ENST00000342385' }, { # non-overlapping consequences 'gene_symbol': 'ACBD5', 'gene_id': 'ENSG00000107897', 'biotype': 'protein_coding', 'distance': 5, - 'consequence_terms': ['3_prime_UTR_variant'] + 'consequence_terms': ['3_prime_UTR_variant'], + 'transcript_id': 'ENST00000676731' }, { # other biotypes 'gene_symbol': 'MIR125A', 'gene_id': 'ENSG00000208008', 'biotype': 'miRNA', - 'consequence_terms': ['missense_variant'] + 'consequence_terms': ['missense_variant'], + 'transcript_id': 'ENST00000385273' } ] } @@ -52,6 +65,7 @@ def test_extract_consequences(): results = extract_consequences( vep_results=vep_results, acceptable_biotypes=['protein_coding'], + include_transcripts=False ) assert results == {'10 27169969 . C A': [ ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), @@ -60,23 +74,53 @@ def test_extract_consequences(): ]} +def test_extract_consequences_with_transcripts(): + """Verifies behaviour of extract_consequences with transcript flag set.""" + vep_results = get_vep_results() + results = extract_consequences( + vep_results=vep_results, + acceptable_biotypes=['protein_coding'], + include_transcripts=True + ) + assert results == {'10 27169969 . C A': [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . 
C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000477034'), + ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120537', '', 'stop_retained_variant', 'ENST00000342385'), + ]} + + def test_overall_most_severe_consequence(): vep_results = get_vep_results() variant_identifier = vep_results[0]['input'] consequences = vep_results[0]['transcript_consequences'] - results = overall_most_severe_consequence(variant_identifier, consequences) + results = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts=False) assert results == [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000208008', 'MIR125A', 'missense_variant') ] +def test_overall_most_severe_consequence_with_transcripts(): + vep_results = get_vep_results() + variant_identifier = vep_results[0]['input'] + consequences = vep_results[0]['transcript_consequences'] + results = overall_most_severe_consequence(variant_identifier, consequences, include_transcripts=True) + assert results == [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000477034'), + ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000208008', 'MIR125A', 'missense_variant', 'ENST00000385273') + ] + + def test_most_severe_consequence_per_gene(): vep_results = get_vep_results() variant_identifier = vep_results[0]['input'] consequences = vep_results[0]['transcript_consequences'] - results = most_severe_consequence_per_gene(variant_identifier, consequences) + results = most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts=False) assert results == [ ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant'), ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant'), @@ -84,3 +128,18 @@ def test_most_severe_consequence_per_gene(): ('10 27169969 . C A', 'ENSG00000107897', 'ACBD5', '3_prime_UTR_variant'), ('10 27169969 . C A', 'ENSG00000208008', 'MIR125A', 'missense_variant') ] + + +def test_most_severe_consequence_per_gene_with_transcripts(): + vep_results = get_vep_results() + variant_identifier = vep_results[0]['input'] + consequences = vep_results[0]['transcript_consequences'] + results = most_severe_consequence_per_gene(variant_identifier, consequences, include_transcripts=True) + assert results == [ + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120539', 'MASTL', 'missense_variant', 'ENST00000477034'), + ('10 27169969 . C A', 'ENSG00000120538', '', 'missense_variant', 'ENST00000342386'), + ('10 27169969 . C A', 'ENSG00000120537', '', 'stop_retained_variant', 'ENST00000342385'), + ('10 27169969 . C A', 'ENSG00000107897', 'ACBD5', '3_prime_UTR_variant', 'ENST00000676731'), + ('10 27169969 . 
C A', 'ENSG00000208008', 'MIR125A', 'missense_variant', 'ENST00000385273')
+    ]

From 1cb35be29520273ae4c17a855c01ae4e0cd6ae39 Mon Sep 17 00:00:00 2001
From: April Shen
Date: Fri, 1 Dec 2023 16:12:37 +0000
Subject: [PATCH 2/9] WIP on getting transcripts from biomart

---
 cmat/consequence_prediction/common/biomart.py | 62 +++++++++++--------
 .../repeat_expansion_variants/pipeline.py     | 24 ++++---
 .../structural_variants/pipeline.py           | 18 ++++--
 .../common/test_biomart.py                    | 12 ++++
 .../test_repeat_pipeline.py                   |  2 +-
 .../test_structural_pipeline.py               | 10 ++-
 6 files changed, 86 insertions(+), 42 deletions(-)
 create mode 100644 tests/consequence_prediction/common/test_biomart.py

diff --git a/cmat/consequence_prediction/common/biomart.py b/cmat/consequence_prediction/common/biomart.py
index 3152ab58..fd665159 100644
--- a/cmat/consequence_prediction/common/biomart.py
+++ b/cmat/consequence_prediction/common/biomart.py
@@ -15,24 +15,32 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
-# The idea behind this template is that we query BioMart with a list of identifiers (`identifier_list`) from the
-# `key_column`. For example, it can be a "hgnc_id" column, which contains a HGNC ID of a given gene. We then ask
-# BioMart to return all mappings from that column to the `query_column`. For example, it can be the "ensembl_gene_id"
-# column, which contains stable Ensembl gene ID.
-BIOMART_REQUEST_TEMPLATE = """http://www.ensembl.org/biomart/martservice?query=
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE Query>
-<Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
-<Dataset name="hsapiens_gene_ensembl" interface="default">
-<Filter name="{key_column}" value="{identifier_list}"/>
-<Attribute name="{key_column}"/>
-<Attribute name="{query_column}"/>
-</Dataset></Query>
-""".replace('\n', '')
-
 # This is to avoid getting a "URL too long" exception.
 MAX_REQUEST_LENGTH = 5000
 
 
+def build_biomart_request_template(key_column, query_columns):
+    """
+    The idea behind this template is that we query BioMart with a list of identifiers (`identifier_list`) from the
+    `key_column`. For example, it can be a "hgnc_id" column, which contains a HGNC ID of a given gene. We then ask
+    BioMart to return all mappings from that column to the `query_column`. For example, it can be the "ensembl_gene_id"
+    column, which contains stable Ensembl gene ID.
+
+    Note `identifier_list` is left to be filled in later, to ensure the identifiers can be chunked appropriately.
+    """
+    biomart_request_template = f"""http://www.ensembl.org/biomart/martservice?query=
+    <?xml version="1.0" encoding="UTF-8"?>
+    <!DOCTYPE Query>
+    <Query virtualSchemaName="default" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
+    <Dataset name="hsapiens_gene_ensembl" interface="default">
+    <Filter name="{key_column}" value="{{identifier_list}}"/>
+    <Attribute name="{key_column}"/>
+    """
+    for query_column in query_columns:
+        biomart_request_template += f'<Attribute name="{query_column}"/>'
+    biomart_request_template += '</Dataset></Query>'
+    return biomart_request_template.replace('\n', '')
+
+
 # Since we are using an external API call here, the @retry decorator will ensure that any sporadic network errors will
 # be handled and the request will be retried.
 @retry(tries=10, delay=5, backoff=1.2, jitter=(1, 3), logger=logger)
@@ -65,14 +73,13 @@ def split_into_chunks(lst, max_size, delimiter=','):
     return chunks
 
 
-# TODO flag to get transcripts
-def query_biomart(key_column, query_column, identifier_list):
+def query_biomart(key_column, query_columns, identifier_list):
     """Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return
     all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe.
 
     Args:
         key_column: A tuple of key column names in Ensembl and in the resulting dataframe, e.g. ('hgnc_id', 'HGNC_ID')
-        query_column: A tuple of query column names, similar to `key_column`, e.g. ('ensembl_gene_id', 'EnsemblGeneID')
+        query_columns: A list of tuples of query column names, similar to `key_column`, e.g. 
('ensembl_gene_id', 'EnsemblGeneID') identifier_list: List of identifiers to query, e.g. ['HGNC:10548', 'HGNC:10560'] Returns: @@ -82,18 +89,21 @@ def query_biomart(key_column, query_column, identifier_list): 0 HGNC:10548 [ENSG00000124788] 1 HGNC:10560 [ENSG00000285258, ENSG00000163635]""" biomart_key_column, df_key_column = key_column - biomart_query_column, df_query_column = query_column + biomart_query_columns, df_query_columns = zip(*query_columns) result = '' - for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(BIOMART_REQUEST_TEMPLATE)): + biomart_request_template = build_biomart_request_template( + key_column=biomart_key_column, + query_columns=biomart_query_columns + ) + for identifier_chunk in split_into_chunks(identifier_list, MAX_REQUEST_LENGTH-len(biomart_request_template)): logger.info(f'Processing chunk of {len(identifier_chunk)} records') # Construct BioMart query from the template (see explanation above). - biomart_query = BIOMART_REQUEST_TEMPLATE.format( - key_column=biomart_key_column, - query_column=biomart_query_column, - identifier_list=','.join(identifier_chunk) - ) + biomart_query = biomart_request_template.format(identifier_list=','.join(identifier_chunk)) result += process_biomart_request(biomart_query) - resulting_df = pd.read_table(StringIO(result), names=(df_key_column, df_query_column), dtype=str) + resulting_df = pd.read_table(StringIO(result), names=(df_key_column,)+df_query_columns, dtype=str) # Group all potential mappings into lists. - resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_column].apply(list).reset_index(name=df_query_column) + # TODO this associates a list of results with each identifier, which does not work + # for both genes and transcripts as it loses the association + # Maybe see if can push this step one level up... + # resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_columns].apply(list).reset_index(names=df_query_columns) return resulting_df diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index cdd98837..388d0d10 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -92,7 +92,7 @@ def load_clinvar_data(clinvar_xml): return variants.sort_values(by=['Name']), stats -def annotate_ensembl_gene_info(variants): +def annotate_ensembl_gene_info(variants, include_transcripts): """Annotate the `variants` dataframe with information about Ensembl gene ID and name""" # Ensembl gene ID can be determined using three ways, listed in the order of decreasing priority. Having multiple @@ -103,6 +103,10 @@ def annotate_ensembl_gene_info(variants): ('GeneSymbol', 'external_gene_name', lambda i: i != '-'), ('TranscriptID', 'refseq_mrna', lambda i: pd.notnull(i)), ) + query_columns = [('ensembl_gene_id', 'EnsemblGeneID')] + if include_transcripts: + query_columns.append(('ensembl_transcript_id', 'EnsemblTranscriptID')) + # This copy of the dataframe is required to facilitate filling in data using the `combine_first()` method. This # allows us to apply priorities: e.g., if a gene ID was already populated using HGNC_ID, it will not be overwritten # by a gene ID determined using GeneSymbol. 
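
A sketch of how the widened `query_biomart` interface is expected to behave once transcripts are requested, mirroring the new test_biomart.py added in this patch (the live Ensembl release determines the actual rows, and the transcript ID shown is a placeholder):

    from cmat.consequence_prediction.common.biomart import query_biomart

    df = query_biomart(
        key_column=('hgnc_id', 'HGNC_ID'),
        query_columns=[('ensembl_gene_id', 'EnsemblGeneID'),
                       ('ensembl_transcript_id', 'EnsemblTranscriptID')],
        identifier_list=['HGNC:10548', 'HGNC:10560'],
    )
    # Each row pairs a gene with one of its transcripts, so the gene-transcript
    # association is preserved, e.g.:
    #     HGNC_ID     EnsemblGeneID    EnsemblTranscriptID
    #     HGNC:10548  ENSG00000124788  ENST00000XXXXXXX
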
@@ -117,9 +121,11 @@
         # Query BioMart for Ensembl Gene IDs
         annotation_info = biomart.query_biomart(
             key_column=(column_name_in_biomart, column_name_in_dataframe),
-            query_column=('ensembl_gene_id', 'EnsemblGeneID'),
+            query_columns=query_columns,
             identifier_list=identifiers_to_query,
         )
+        # TODO collapse to list (?)
+
         # Make note where the annotations came from
         annotation_info['GeneAnnotationSource'] = column_name_in_dataframe
         # Combine the information we received with the *original* dataframe (a copy made before any iterations of this
@@ -127,16 +133,19 @@
         annotation_df = pd.merge(variants_original, annotation_info, on=column_name_in_dataframe, how='left')
         # Update main dataframe with the new values. This replaces the NaN values in the dataframe with the ones
         # available in another dataframe we just created, `annotation_df`.
+        # TODO check this combine step if the previous info is not in lists
         variants = variants \
             .set_index([column_name_in_dataframe]) \
             .combine_first(annotation_df.set_index([column_name_in_dataframe]))
 
+    # TODO check all this exploding stuff for transcripts
     # Reset index to default
     variants.reset_index(inplace=True)
     # Some records are being annotated to multiple Ensembl genes. For example, HGNC:10560 is being resolved to
     # ENSG00000285258 and ENSG00000163635. We need to explode dataframe by that column.
     variants = variants.explode('EnsemblGeneID')
 
+    # TODO another query to biomart, this one should not need to change
     # Based on the Ensembl gene ID, annotate (1) gene name and (2) which chromosome it is on
     gene_query_columns = (
         ('external_gene_name', 'EnsemblGeneName'),
@@ -145,7 +154,7 @@
     for column_name_in_biomart, column_name_in_dataframe in gene_query_columns:
         annotation_info = biomart.query_biomart(
             key_column=('ensembl_gene_id', 'EnsemblGeneID'),
-            query_column=(column_name_in_biomart, column_name_in_dataframe),
+            query_columns=[(column_name_in_biomart, column_name_in_dataframe)],
            identifier_list=sorted({str(i) for i in variants['EnsemblGeneID'] if str(i).startswith('ENSG')}),
         )
         variants = pd.merge(variants, annotation_info, on='EnsemblGeneID', how='left')
@@ -184,7 +193,7 @@ def generate_consequences_file(consequences, output_consequences):
     logger.info(f'  {sum(consequences.RepeatType == "short_tandem_repeat_expansion")} short tandem repeat expansion')
 
 
-def extract_consequences(variants):
+def extract_consequences(variants, include_transcripts):
     # Generate consequences table
     consequences = variants[variants['RecordIsComplete']] \
         .groupby(['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'], group_keys=False)['RepeatType'] \
@@ -217,11 +226,12 @@ def generate_all_variants_file(output_dataframe, variants):
     variants.to_csv(output_dataframe, sep='\t', index=False)
 
 
-def main(clinvar_xml, output_consequences=None, output_dataframe=None):
+def main(clinvar_xml, include_transcripts, output_consequences=None, output_dataframe=None):
     """Process data and generate output files.
 
     Args:
         clinvar_xml: filepath to the ClinVar XML file.
+        include_transcripts: whether to include Ensembl transcript IDs alongside gene IDs in the output consequences.
         output_consequences: filepath to the output file with variant consequences. The file uses a 6-column format
             compatible with the VEP mapping pipeline (see /consequence_prediction/README.md).
         output_dataframe: filepath to the output file with the full dataframe used in the analysis. 
This will contain @@ -247,7 +257,7 @@ def main(clinvar_xml, output_consequences=None, output_dataframe=None): return None logger.info('Match each record to Ensembl gene ID and name') - variants = annotate_ensembl_gene_info(variants) + variants = annotate_ensembl_gene_info(variants, include_transcripts) logger.info('Determine variant type and whether the record is complete') variants = variants.apply(lambda row: determine_complete(row), axis=1) @@ -255,7 +265,7 @@ def main(clinvar_xml, output_consequences=None, output_dataframe=None): logger.info('Postprocess data and output the two final tables') if output_dataframe is not None: generate_all_variants_file(output_dataframe, variants) - consequences = extract_consequences(variants) + consequences = extract_consequences(variants, include_transcripts) if output_consequences is not None: generate_consequences_file(consequences, output_consequences) diff --git a/cmat/consequence_prediction/structural_variants/pipeline.py b/cmat/consequence_prediction/structural_variants/pipeline.py index cca67c71..eff2c814 100644 --- a/cmat/consequence_prediction/structural_variants/pipeline.py +++ b/cmat/consequence_prediction/structural_variants/pipeline.py @@ -79,20 +79,26 @@ def generate_consequences_file(consequences, output_consequences): consequences.to_csv(output_consequences, sep='\t', index=False, header=False) -def main(clinvar_xml, output_consequences=None): +def main(clinvar_xml, include_transcripts, output_consequences=None): vep_results = get_vep_results(clinvar_xml) - results_by_variant = extract_consequences(vep_results=vep_results, acceptable_biotypes={'protein_coding', 'miRNA'}) + results_by_variant = extract_consequences( + vep_results=vep_results, + acceptable_biotypes={'protein_coding', 'miRNA'}, + include_transcripts=include_transcripts) variant_data = [] for variant_id, variant_consequences in results_by_variant.items(): for consequence_to_yield in deduplicate_list(variant_consequences): - variant_id, gene_id, gene_symbol, consequence_term = consequence_to_yield + # * operator lets us keep all the annotations, regardless of whether transcript is there or not + variant_id, *annotations = consequence_to_yield # variant_id here is the entire input to VEP, we only want the HGVS that we passed as an identifier hgvs_id = variant_id.split()[-1] - variant_data.append((hgvs_id, gene_id, gene_symbol, consequence_term)) + variant_data.append([hgvs_id] + annotations) # Return as a dataframe to be compatible with repeat expansion pipeline - consequences = pd.DataFrame(variant_data, columns=('VariantID', 'EnsemblGeneID', - 'EnsemblGeneName', 'ConsequenceTerm')) + column_names = ['VariantID', 'EnsemblGeneID', 'EnsemblGeneName', 'ConsequenceTerm'] + if include_transcripts: + column_names.append('EnsemblTranscriptID') + consequences = pd.DataFrame(variant_data, columns=column_names) if output_consequences is not None: generate_consequences_file(consequences, output_consequences) return consequences diff --git a/tests/consequence_prediction/common/test_biomart.py b/tests/consequence_prediction/common/test_biomart.py new file mode 100644 index 00000000..6421b462 --- /dev/null +++ b/tests/consequence_prediction/common/test_biomart.py @@ -0,0 +1,12 @@ +from cmat.consequence_prediction.common.biomart import query_biomart + + +def test_query_biomart(): + key_column = ('hgnc_id', 'HGNC_ID') + query_columns = [('ensembl_gene_id', 'EnsemblGeneID'), ('ensembl_transcript_id', 'EnsemblTranscriptID')] + identifier_list = ['HGNC:10548', 'HGNC:10560'] + result_df = 
query_biomart(key_column, query_columns, identifier_list) + assert set(result_df.columns) == {'HGNC_ID', 'EnsemblGeneID', 'EnsemblTranscriptID'} + assert len(result_df) == 35 + assert set(result_df['HGNC_ID']) == set(identifier_list) + assert set(result_df['EnsemblGeneID']) == {'ENSG00000163635', 'ENSG00000124788'} diff --git a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py index 2dc466e1..27384ee2 100644 --- a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py +++ b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py @@ -24,7 +24,7 @@ def run_pipeline(resource_name): """Runs the pipeline on a given test resource and returns the output consequences as a list of lists.""" input_filename = get_test_resource(resource_name) output_consequences, output_dataframe = [tempfile.NamedTemporaryFile(delete=False) for _ in range(2)] - pipeline.main(input_filename, output_consequences.name, output_dataframe.name) + pipeline.main(input_filename, False, output_consequences.name, output_dataframe.name) consequences = [line.rstrip().split('\t') for line in open(output_consequences.name).read().splitlines()] for temp_file in (output_consequences, output_dataframe): os.remove(temp_file.name) diff --git a/tests/consequence_prediction/structural_variants/test_structural_pipeline.py b/tests/consequence_prediction/structural_variants/test_structural_pipeline.py index b22e70d4..42edaea1 100644 --- a/tests/consequence_prediction/structural_variants/test_structural_pipeline.py +++ b/tests/consequence_prediction/structural_variants/test_structural_pipeline.py @@ -16,10 +16,10 @@ def get_test_resource(resource_name): return os.path.join(module_directory, 'resources', resource_name) -def run_pipeline(resource_name): +def run_pipeline(resource_name, include_transcripts=False): """Runs the pipeline on a given test resource and returns the output consequences as a list of lists.""" input_filename = get_test_resource(resource_name) - consequences_df = pipeline.main(input_filename) + consequences_df = pipeline.main(input_filename, include_transcripts) consequences = [[col for col in row] for row in consequences_df.itertuples(index=False)] return consequences @@ -35,6 +35,12 @@ def test_successful_run(): ]) +def test_successful_run_with_transcripts(): + results = run_pipeline('precise_genomic.xml.gz', include_transcripts=True) + assert len(results) == 28 + assert ['NC_000001.11:g.25271785_25329047del', 'ENSG00000187010', 'RHD', 'stop_lost', 'ENST00000454452'] in results + + def test_has_complete_coordinates(): assert run_pipeline('complete_coordinates.xml.gz') == [] From 21498752dffce77a494629f8bdca9d31111f6781 Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 5 Dec 2023 17:52:23 +0000 Subject: [PATCH 3/9] add transcript flag in repeat expansion pipeline --- cmat/consequence_prediction/common/biomart.py | 21 +++--- .../repeat_expansion_variants/pipeline.py | 75 +++++++++---------- .../test_repeat_pipeline.py | 35 +++++++++ 3 files changed, 79 insertions(+), 52 deletions(-) diff --git a/cmat/consequence_prediction/common/biomart.py b/cmat/consequence_prediction/common/biomart.py index fd665159..a571597a 100644 --- a/cmat/consequence_prediction/common/biomart.py +++ b/cmat/consequence_prediction/common/biomart.py @@ -3,7 +3,7 @@ (HGNC gene ID, gene symbol, RefSeq transcript ID) to Ensembl gene ID and name. 
The BioMart API is a bit quirky in that it uses XML to specify the request.
 """
-
+import re
 from io import StringIO
 
 import logging
@@ -38,7 +38,7 @@ def build_biomart_request_template(key_column, query_columns):
     for query_column in query_columns:
         biomart_request_template += f'<Attribute name="{query_column}"/>'
     biomart_request_template += '</Dataset></Query>'
-    return biomart_request_template.replace('\n', '')
+    return re.sub(r'\n *', '', biomart_request_template)
 
 
 # Since we are using an external API call here, the @retry decorator will ensure that any sporadic network errors will
@@ -75,7 +75,8 @@ def split_into_chunks(lst, max_size, delimiter=','):
 
 def query_biomart(key_column, query_columns, identifier_list):
     """Query Ensembl BioMart with a list of identifiers (`identifier_list`) from one column (`key_column`) and return
-    all mappings from those identifiers to another column (`query_column`) in form of a two-column Pandas dataframe.
+    all mappings from those identifiers to one or more other columns (`query_columns`) in the form of a Pandas
+    dataframe.
 
     Args:
         key_column: A tuple of key column names in Ensembl and in the resulting dataframe, e.g. ('hgnc_id', 'HGNC_ID')
@@ -83,11 +84,12 @@ def query_biomart(key_column, query_columns, identifier_list):
         identifier_list: List of identifiers to query, e.g. ['HGNC:10548', 'HGNC:10560']
 
     Returns:
-        A Pandas dataframe with two columns. It will contain at most one row per input identifier. The query column will
-        always contain a *list* to support the possibility of multiple mappings. In the example above, this will be:
+        A Pandas dataframe with the key column plus one column per query column. Multiple mappings will be represented
+        as separate rows. In the example above, this will be:
         HGNC_ID EnsemblGeneID
-        0 HGNC:10548 [ENSG00000124788]
-        1 HGNC:10560 [ENSG00000285258, ENSG00000163635]"""
+        0 HGNC:10548 ENSG00000124788
+        1 HGNC:10560 ENSG00000285258
+        2 HGNC:10560 ENSG00000163635"""
     biomart_key_column, df_key_column = key_column
     biomart_query_columns, df_query_columns = zip(*query_columns)
     result = ''
@@ -101,9 +103,4 @@ def query_biomart(key_column, query_columns, identifier_list):
         biomart_query = biomart_request_template.format(identifier_list=','.join(identifier_chunk))
         result += process_biomart_request(biomart_query)
     resulting_df = pd.read_table(StringIO(result), names=(df_key_column,)+df_query_columns, dtype=str)
-    # Group all potential mappings into lists.
-    # TODO this associates a list of results with each identifier, which does not work
-    # for both genes and transcripts as it loses the association
-    # Maybe see if can push this step one level up...
-    # resulting_df = resulting_df.groupby(df_key_column, group_keys=False)[df_query_columns].apply(list).reset_index(names=df_query_columns)
     return resulting_df
diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py
index 388d0d10..0ad662bb 100644
--- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py
+++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py
@@ -107,15 +107,17 @@ def annotate_ensembl_gene_info(variants, include_transcripts):
     if include_transcripts:
         query_columns.append(('ensembl_transcript_id', 'EnsemblTranscriptID'))
 
-    # This copy of the dataframe is required to facilitate filling in data using the `combine_first()` method. This
-    # allows us to apply priorities: e.g., if a gene ID was already populated using HGNC_ID, it will not be overwritten
-    # by a gene ID determined using GeneSymbol. 
- variants_original = variants.copy(deep=True) + # Make a copy of the variants dataframe to query. Variants will be removed from this copy as Ensembl gene IDs are + # found, ensuring that we don't make unnecessary queries. + variants_to_query = variants + + # Empty dataframe to collect annotated rows + annotated_variants = pd.DataFrame() for column_name_in_dataframe, column_name_in_biomart, filtering_function in gene_annotation_sources: # Get all identifiers we want to query BioMart with identifiers_to_query = sorted({ - i for i in variants[column_name_in_dataframe] + i for i in variants_to_query[column_name_in_dataframe] if filtering_function(i) }) # Query BioMart for Ensembl Gene IDs @@ -124,47 +126,40 @@ def annotate_ensembl_gene_info(variants, include_transcripts): query_columns=query_columns, identifier_list=identifiers_to_query, ) - # TODO collapse to list (?) # Make note where the annotations came from annotation_info['GeneAnnotationSource'] = column_name_in_dataframe - # Combine the information we received with the *original* dataframe (a copy made before any iterations of this - # cycle were allowed to run). This is similar to SQL merge. - annotation_df = pd.merge(variants_original, annotation_info, on=column_name_in_dataframe, how='left') - # Update main dataframe with the new values. This replaces the NaN values in the dataframe with the ones - # available in another dataframe we just created, `annotation_df`. - # TODO check this combine step if the previous info is not in lists - variants = variants \ - .set_index([column_name_in_dataframe]) \ - .combine_first(annotation_df.set_index([column_name_in_dataframe])) - - # TODO check all this exploding stuff for transcripts - # Reset index to default - variants.reset_index(inplace=True) - # Some records are being annotated to multiple Ensembl genes. For example, HGNC:10560 is being resolved to - # ENSG00000285258 and ENSG00000163635. We need to explode dataframe by that column. - variants = variants.explode('EnsemblGeneID') - - # TODO another query to biomart, this one should not need to change + # Combine the information we received with the *original* dataframe using an outer join. + total_merge = pd.merge(variants, annotation_info, on=column_name_in_dataframe, how='outer', indicator=True) + + # Variants annotated in this round are those present in both tables in the join, add these to the + # cumulative results. + annotated_variants = pd.concat((annotated_variants, total_merge[total_merge['_merge'] == 'both'])) + annotated_variants.drop('_merge', axis=1, inplace=True) + + # Variants that still need to be queried are those present only in the left table in the join. 
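    # A toy illustration of the indicator-based split introduced here: pd.merge(...,
    # indicator=True) adds a '_merge' column valued 'both', 'left_only' or 'right_only'
    # (the dataframes below are invented for the example):
    #     left = pd.DataFrame({'id': [1, 2]})
    #     right = pd.DataFrame({'id': [1], 'val': ['x']})
    #     merged = pd.merge(left, right, on='id', how='outer', indicator=True)
    #     merged[merged['_merge'] == 'both']       # id 1 -> annotated in this round
    #     merged[merged['_merge'] == 'left_only']  # id 2 -> retry with the next source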
+ variants_to_query = total_merge[total_merge['_merge'] == 'left_only'] + variants_to_query.drop('_merge', axis=1, inplace=True) + if len(variants_to_query) == 0: + break + # Based on the Ensembl gene ID, annotate (1) gene name and (2) which chromosome it is on - gene_query_columns = ( + gene_query_columns = [ ('external_gene_name', 'EnsemblGeneName'), ('chromosome_name', 'EnsemblChromosomeName'), + ] + annotation_info = biomart.query_biomart( + key_column=('ensembl_gene_id', 'EnsemblGeneID'), + query_columns=gene_query_columns, + identifier_list=sorted({str(i) for i in annotated_variants['EnsemblGeneID'] if str(i).startswith('ENSG')}), ) - for column_name_in_biomart, column_name_in_dataframe in gene_query_columns: - annotation_info = biomart.query_biomart( - key_column=('ensembl_gene_id', 'EnsemblGeneID'), - query_columns=[(column_name_in_biomart, column_name_in_dataframe)], - identifier_list=sorted({str(i) for i in variants['EnsemblGeneID'] if str(i).startswith('ENSG')}), - ) - variants = pd.merge(variants, annotation_info, on='EnsemblGeneID', how='left') - # Check that there are no multiple mappings for any given ID - assert variants[column_name_in_dataframe].str.len().dropna().max() == 1, \ - 'Found multiple gene ID → gene attribute mappings!' - # Convert the one-item list into a plain column - variants = variants.explode(column_name_in_dataframe) + annotated_variants = pd.merge(annotated_variants, annotation_info, on='EnsemblGeneID', how='left') + # Check that there are no multiple mappings for any given ID + # TODO replace this check + # assert variants[column_name_in_dataframe].str.len().dropna().max() == 1, \ + # 'Found multiple gene ID → gene attribute mappings!' - return variants + return annotated_variants def determine_complete(row): @@ -181,11 +176,10 @@ def determine_complete(row): def generate_consequences_file(consequences, output_consequences): """Output final table.""" - if consequences.empty: logger.info('There are no records ready for output') return - # Write the consequences table. This is used by the main evidence string generation pipeline. + # Write the consequences table. This is used by the main annotation pipeline. consequences.to_csv(output_consequences, sep='\t', index=False, header=False) # Output statistics logger.info(f'Generated {len(consequences)} consequences in total:') @@ -194,6 +188,7 @@ def generate_consequences_file(consequences, output_consequences): def extract_consequences(variants, include_transcripts): + # TODO include transcript ID column in output # Generate consequences table consequences = variants[variants['RecordIsComplete']] \ .groupby(['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'], group_keys=False)['RepeatType'] \ diff --git a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py index 27384ee2..3313938d 100644 --- a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py +++ b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py @@ -4,7 +4,10 @@ import os import tempfile +import pandas as pd + from cmat.consequence_prediction.repeat_expansion_variants import pipeline +from cmat.consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info def get_test_resource(resource_name): @@ -100,3 +103,35 @@ def test_missing_names_and_hgvs(): # ref=T, alt=TACACACACACAC => classified as trinucleotide repeat without repeating unit inference. 
['RCV001356600', 'ENSG00000136869', 'TLR4', 'trinucleotide_repeat_expansion'] ] + + +def test_annotate_genes_with_transcripts(): + """Tests annotation with genes and transcripts, using multiple sources (HGNC, gene symbol, RefSeq transcript)""" + variants = pd.DataFrame([ + ['variant_with_hgnc', 'RCV1', '-', 'HGNC:11850', None, None], + ['variant_with_gene_symbol', 'RCV2', 'PRDM12', '-', None, None], + ['variant_with_refseq', 'RCV3', '-', '-', 'NM_001377405', None], + ['variant_not_found', 'RCV4', 'blah', 'HGNC:blah', 'NM_blah', None] + ], columns=('Name', 'RCVaccession', 'GeneSymbol', 'HGNC_ID', 'TranscriptID', 'RepeatType')) + annotated_variants = annotate_ensembl_gene_info(variants, include_transcripts=True) + assert annotated_variants.shape == (7, 11) + + # Helper function to check gene/transcript annotations for a particular variant + def assert_gene_transcript(name, arr): + assert ( + annotated_variants[annotated_variants['Name'] == name][['EnsemblGeneID', 'EnsemblTranscriptID']] + .values == arr + ).all() + + assert_gene_transcript('variant_with_hgnc', [ + ['ENSG00000136869', 'ENST00000472304'], + ['ENSG00000136869', 'ENST00000394487'], + ['ENSG00000136869', 'ENST00000355622'], + ['ENSG00000136869', 'ENST00000490685'], + ]) + assert_gene_transcript('variant_with_gene_symbol', [ + ['ENSG00000130711', 'ENST00000253008'], + ['ENSG00000130711', 'ENST00000676323'], + ]) + assert_gene_transcript('variant_with_refseq', [['ENSG00000163635', 'ENST00000674280']]) + assert 'variant_not_found' not in annotated_variants['Name'] From a9776dbf03867fcffa0388a7d495eb1ab53e9fd8 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Dec 2023 11:24:50 +0000 Subject: [PATCH 4/9] refactor uniqueness checks and add tests --- .../repeat_expansion_variants/pipeline.py | 59 +++++++++++++------ .../test_repeat_pipeline.py | 30 +++++++++- 2 files changed, 70 insertions(+), 19 deletions(-) diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index 0ad662bb..97e83b08 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -144,10 +144,10 @@ def annotate_ensembl_gene_info(variants, include_transcripts): break # Based on the Ensembl gene ID, annotate (1) gene name and (2) which chromosome it is on - gene_query_columns = [ + gene_query_columns = ( ('external_gene_name', 'EnsemblGeneName'), ('chromosome_name', 'EnsemblChromosomeName'), - ] + ) annotation_info = biomart.query_biomart( key_column=('ensembl_gene_id', 'EnsemblGeneID'), query_columns=gene_query_columns, @@ -155,20 +155,21 @@ def annotate_ensembl_gene_info(variants, include_transcripts): ) annotated_variants = pd.merge(annotated_variants, annotation_info, on='EnsemblGeneID', how='left') # Check that there are no multiple mappings for any given ID - # TODO replace this check - # assert variants[column_name_in_dataframe].str.len().dropna().max() == 1, \ - # 'Found multiple gene ID → gene attribute mappings!' 
+ for _, column_name_in_dataframe in gene_query_columns: + assert_uniqueness(annotated_variants, ['EnsemblGeneID'], column_name_in_dataframe, + 'Found multiple gene ID → gene attribute mappings!') return annotated_variants -def determine_complete(row): +def determine_complete(row, include_transcripts): """Depending on the information, determine whether the record is complete, i.e., whether it has all necessary fields to be output for the final "consequences" table.""" row['RecordIsComplete'] = ( pd.notnull(row['EnsemblGeneID']) and pd.notnull(row['EnsemblGeneName']) and pd.notnull(row['RepeatType']) and + (pd.notnull(row['EnsemblTranscriptID']) if include_transcripts else True) and row['EnsemblChromosomeName'] in STANDARD_CHROMOSOME_NAMES ) return row @@ -187,22 +188,44 @@ def generate_consequences_file(consequences, output_consequences): logger.info(f' {sum(consequences.RepeatType == "short_tandem_repeat_expansion")} short tandem repeat expansion') +def assert_uniqueness(df, uniqueness_columns, target_column, error_msg): + """ + Check uniqueness of values in target_column with respect to tuples in uniqueness_columns. + + Args: + df: dataframe to check + uniqueness_columns: iterable of column names + target_column: name of column that should be unique + error_msg: message to report if uniqueness check fails + Returns: + result_df: input df containing only uniqueness_columns and target_column + """ + result_df = df.groupby(uniqueness_columns, group_keys=False)[target_column].apply(set).reset_index(name=target_column) + if result_df.empty: + return result_df + assert result_df[target_column].apply(len).dropna().max() == 1, error_msg + # Get rid of sets + result_df[target_column] = result_df[target_column].apply(list) + result_df = result_df.explode(target_column) + return result_df + + def extract_consequences(variants, include_transcripts): - # TODO include transcript ID column in output - # Generate consequences table - consequences = variants[variants['RecordIsComplete']] \ - .groupby(['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'], group_keys=False)['RepeatType'] \ - .apply(set).reset_index(name='RepeatType') + """Generate consequences table""" + # Check that for every (RCV, gene) pair or (RCV, gene, transcript) triple there is only one consequence type + unique_repeat_type_columns = ['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName'] + if include_transcripts: + unique_repeat_type_columns.append('EnsemblTranscriptID') + consequences = assert_uniqueness(variants[variants['RecordIsComplete']], unique_repeat_type_columns, 'RepeatType', + 'Multiple (RCV, gene) → variant type mappings!') if consequences.empty: return consequences - # Check that for every (RCV, gene) pair there is only one consequence type - assert consequences['RepeatType'].str.len().dropna().max() == 1, 'Multiple (RCV, gene) → variant type mappings!' 
- # Get rid of sets - consequences['RepeatType'] = consequences['RepeatType'].apply(list) - consequences = consequences.explode('RepeatType') # Form a four-column file compatible with the consequence mapping pipeline, for example: # RCV000005966 ENSG00000156475 PPP2R2B trinucleotide_repeat_expansion - consequences = consequences[['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType']] + output_columns = ['RCVaccession', 'EnsemblGeneID', 'EnsemblGeneName', 'RepeatType'] + if include_transcripts: + output_columns.append('EnsemblTranscriptID') + consequences = consequences[output_columns] consequences.sort_values(by=['RepeatType', 'RCVaccession', 'EnsemblGeneID'], inplace=True) # Check that there are no empty cells in the final consequences table assert consequences.isnull().to_numpy().sum() == 0 @@ -255,7 +278,7 @@ def main(clinvar_xml, include_transcripts, output_consequences=None, output_data variants = annotate_ensembl_gene_info(variants, include_transcripts) logger.info('Determine variant type and whether the record is complete') - variants = variants.apply(lambda row: determine_complete(row), axis=1) + variants = variants.apply(lambda row: determine_complete(row, include_transcripts), axis=1) logger.info('Postprocess data and output the two final tables') if output_dataframe is not None: diff --git a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py index 3313938d..7403a9ed 100644 --- a/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py +++ b/tests/consequence_prediction/repeat_expansion_variants/test_repeat_pipeline.py @@ -5,9 +5,10 @@ import tempfile import pandas as pd +import pytest from cmat.consequence_prediction.repeat_expansion_variants import pipeline -from cmat.consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info +from cmat.consequence_prediction.repeat_expansion_variants.pipeline import annotate_ensembl_gene_info, assert_uniqueness def get_test_resource(resource_name): @@ -105,6 +106,33 @@ def test_missing_names_and_hgvs(): ] +def test_assert_uniqueness(): + uniqueness_columns = ['letter', 'number'] + target_column = 'target' + df = pd.DataFrame([ + ['A', 1, 'not important', 'something'], + ['A', 1, 'not important', 'something'], + ['B', 2, 'not important', 'something else'] + ], columns=uniqueness_columns + ['not important column', target_column]) + result_df = assert_uniqueness(df, uniqueness_columns, target_column, 'failure') + assert result_df.equals(pd.DataFrame([ + ['A', 1, 'something'], + ['B', 2, 'something else'] + ], columns=uniqueness_columns + [target_column])) + + +def test_assert_uniqueness_failure(): + uniqueness_columns = ['letter', 'number'] + target_column = 'target' + df = pd.DataFrame([ + ['A', 1, 'something'], + ['A', 1, 'something else'], + ['B', 2, 'something'] + ], columns=uniqueness_columns + [target_column]) + with pytest.raises(AssertionError): + assert_uniqueness(df, uniqueness_columns, target_column, 'failure') + + def test_annotate_genes_with_transcripts(): """Tests annotation with genes and transcripts, using multiple sources (HGNC, gene symbol, RefSeq transcript)""" variants = pd.DataFrame([ From 2d14a3e59972e3e88610de54f74d5fa36f79ac27 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 6 Dec 2023 12:49:07 +0000 Subject: [PATCH 5/9] add flags to scripts --- bin/consequence_prediction/run_repeat_expansion_variants.py | 6 +++++- 
bin/consequence_prediction/run_structural_variants.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/consequence_prediction/run_repeat_expansion_variants.py b/bin/consequence_prediction/run_repeat_expansion_variants.py index a1ef8cdb..078c8a08 100755 --- a/bin/consequence_prediction/run_repeat_expansion_variants.py +++ b/bin/consequence_prediction/run_repeat_expansion_variants.py @@ -9,6 +9,10 @@ '--clinvar-xml', required=True, help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)' ) +parser.add_argument( + '--include-transcripts', required=False, action='store_true', + help='Whether to include transcript IDs along with consequence terms' +) parser.add_argument( '--output-consequences', required=True, help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.' @@ -18,4 +22,4 @@ help='File to output full dataframe for subsequent analysis and debugging.' ) args = parser.parse_args() -pipeline.main(args.clinvar_xml, args.output_consequences, args.output_dataframe) +pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences, args.output_dataframe) diff --git a/bin/consequence_prediction/run_structural_variants.py b/bin/consequence_prediction/run_structural_variants.py index e58cf57f..3ef5093c 100755 --- a/bin/consequence_prediction/run_structural_variants.py +++ b/bin/consequence_prediction/run_structural_variants.py @@ -9,10 +9,14 @@ '--clinvar-xml', required=True, help='ClinVar XML dump file (ClinVarFullRelease_00-latest.xml.gz)' ) +parser.add_argument( + '--include-transcripts', required=False, action='store_true', + help='Whether to include transcript IDs along with consequence terms' +) parser.add_argument( '--output-consequences', required=True, help='File to output functional consequences to. Format is compatible with the main VEP mapping pipeline.' 
) args = parser.parse_args() -pipeline.main(args.clinvar_xml, args.output_consequences) +pipeline.main(args.clinvar_xml, args.include_transcripts, args.output_consequences) From db26e08d0d704a558cafe63da05774238910d8d3 Mon Sep 17 00:00:00 2001 From: April Shen Date: Thu, 7 Dec 2023 15:41:46 +0000 Subject: [PATCH 6/9] WIP for transcripts in annotation --- bin/evaluation/map_genes.py | 3 ++- cmat/output_generation/annotated_clinvar.py | 1 + cmat/output_generation/consequence_type.py | 1 + pipelines/annotation_pipeline.nf | 17 +++++++++++------ 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/bin/evaluation/map_genes.py b/bin/evaluation/map_genes.py index f74c82e2..3b89c085 100644 --- a/bin/evaluation/map_genes.py +++ b/bin/evaluation/map_genes.py @@ -53,7 +53,8 @@ def load_clinvar_data(clinvar_xml): def main(clinvar_xml, output_file): """Load ClinVar XML, map to Ensembl gene IDs, and dump results to TSV.""" variants = load_clinvar_data(clinvar_xml) - annotated_variants = annotate_ensembl_gene_info(variants) + # Don't include transcripts for evaluation + annotated_variants = annotate_ensembl_gene_info(variants, include_transcripts=False) annotated_variants[['RCVaccession', 'EnsemblGeneID']].to_csv(output_file, sep='\t', index=False, header=False) diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py index bba3bb42..7c05dd20 100644 --- a/cmat/output_generation/annotated_clinvar.py +++ b/cmat/output_generation/annotated_clinvar.py @@ -91,6 +91,7 @@ def annotate(self, record): self.overall_counts['both_measure_and_trait'] += 1 def annotate_and_count_measure(self, record): + # TODO include transcript if present in variant_to_gene_mappings consequence_types, variant_category = get_consequence_types(record.measure, self.variant_to_gene_mappings) record.measure.add_ensembl_annotations(consequence_types) diff --git a/cmat/output_generation/consequence_type.py b/cmat/output_generation/consequence_type.py index 48ad3bb8..30ae58a4 100644 --- a/cmat/output_generation/consequence_type.py +++ b/cmat/output_generation/consequence_type.py @@ -15,6 +15,7 @@ def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term): def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None): + # TODO adapt for transcripts if present """ Return a dictionary of consequence information extracted from the given file. If consequence_type_dict is provided then the information will be merge into this dictionary. diff --git a/pipelines/annotation_pipeline.nf b/pipelines/annotation_pipeline.nf index 60d38f85..1fc9fb48 100644 --- a/pipelines/annotation_pipeline.nf +++ b/pipelines/annotation_pipeline.nf @@ -8,11 +8,12 @@ def helpMessage() { Generate ClinVar evidence strings for Open Targets, or annotated ClinVar XML. 
Params: - --output_dir Directory for output - --schema Open Targets JSON schema version (optional, will output XML if omitted) - --clinvar ClinVar XML file (optional, will download latest if omitted) - --mappings Trait mappings file (optional, will use a default path if omitted) - --evaluate Whether to run evaluation or not (default false) + --output_dir Directory for output + --schema Open Targets JSON schema version (optional, will output XML if omitted) + --clinvar ClinVar XML file (optional, will download latest if omitted) + --mappings Trait mappings file (optional, will use a default path if omitted) + --include_transcripts Whether to include transcripts in consequences (default false) + --evaluate Whether to run evaluation or not (default false) """ } @@ -21,6 +22,7 @@ params.output_dir = null params.schema = null params.clinvar = null params.mappings = '${BATCH_ROOT_BASE}/manual_curation/latest_mappings.tsv' +params.include_transcripts = false params.evaluate = false if (params.help) { @@ -31,7 +33,7 @@ if (!params.output_dir) { } batchRoot = params.output_dir codeRoot = "${projectDir}/.." - +includeTranscriptsFlag = params.include_transcripts ? "--include-transcripts" : "" /* * Main workflow. @@ -135,6 +137,7 @@ process runSnpIndel { -N 200 `# Number of records (lines) per worker` \ --tmpdir . `# Store temporary files in the current directory to avoid /tmp overflow` \ \${PYTHON_BIN} "${codeRoot}/cmat/consequence_prediction/snp_indel_variants/pipeline.py" \ + ${includeTranscriptsFlag} \ | sort -u > consequences_snp.tsv """ } @@ -161,6 +164,7 @@ process runRepeat { """ \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_repeat_expansion_variants.py \ --clinvar-xml ${clinvarXml} \ + ${includeTranscriptsFlag} \ --output-consequences consequences_repeat.tsv # create an empty file if nothing generated @@ -191,6 +195,7 @@ process runStructural { """ \${PYTHON_BIN} ${codeRoot}/bin/consequence_prediction/run_structural_variants.py \ --clinvar-xml ${clinvarXml} \ + ${includeTranscriptsFlag} \ --output-consequences consequences_structural.tsv # create an empty file if nothing generated From 7716caa6416440cbdedcb05dbd0887097a1daa96 Mon Sep 17 00:00:00 2001 From: April Shen Date: Mon, 11 Dec 2023 14:29:09 +0000 Subject: [PATCH 7/9] add transcript id to annotated xml output --- cmat/output_generation/annotated_clinvar.py | 11 ++++++++--- cmat/output_generation/consequence_type.py | 17 +++++++++++------ .../expected_annotation_output.xml.gz | Bin 2167 -> 2174 bytes 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/cmat/output_generation/annotated_clinvar.py b/cmat/output_generation/annotated_clinvar.py index 7c05dd20..3c17afe5 100644 --- a/cmat/output_generation/annotated_clinvar.py +++ b/cmat/output_generation/annotated_clinvar.py @@ -91,7 +91,6 @@ def annotate(self, record): self.overall_counts['both_measure_and_trait'] += 1 def annotate_and_count_measure(self, record): - # TODO include transcript if present in variant_to_gene_mappings consequence_types, variant_category = get_consequence_types(record.measure, self.variant_to_gene_mappings) record.measure.add_ensembl_annotations(consequence_types) @@ -242,8 +241,14 @@ def add_ensembl_annotations(self, consequences): attribute_elt.text = consequence_attributes.so_term.so_name.replace('_', ' ') so_elt = ET.Element('XRef', attrib={'ID': self.format_so_term(consequence_attributes.so_term), 'DB': 'Sequence Ontology'}) - ensembl_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_gene_id, 'DB': 'Ensembl'}) - 
attr_set_elt.extend((attribute_elt, so_elt, ensembl_elt))
+        ensembl_gene_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_gene_id,
+                                                      'DB': 'Ensembl Gene'})
+        attr_set_elt.extend((attribute_elt, so_elt, ensembl_gene_elt))
+        # Add transcript if present
+        if consequence_attributes.ensembl_transcript_id:
+            ensembl_transcript_elt = ET.Element('XRef', attrib={'ID': consequence_attributes.ensembl_transcript_id,
+                                                                'DB': 'Ensembl Transcript'})
+            attr_set_elt.append(ensembl_transcript_elt)
             consequence_elts.append(attr_set_elt)
         self.measure_xml.extend(consequence_elts)

diff --git a/cmat/output_generation/consequence_type.py b/cmat/output_generation/consequence_type.py
index 30ae58a4..2b4097ff 100644
--- a/cmat/output_generation/consequence_type.py
+++ b/cmat/output_generation/consequence_type.py
@@ -10,15 +10,14 @@
 logger = logging.getLogger(__package__)
 
-def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term):
-    consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term)))
+def process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term, ensembl_transcript_id=None):
+    consequence_type_dict[variant_id].append(ConsequenceType(ensembl_gene_id, SoTerm(so_term), ensembl_transcript_id))
 
 
 def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
-    # TODO adapt for transcripts if present
     """
     Return a dictionary of consequence information extracted from the given file.
-    If consequence_type_dict is provided then the information will be merge into this dictionary.
+    If consequence_type_dict is provided then the information will be merged into this dictionary.
     """
     logger.info('Loading mapping rs -> ENSG/SOterms')
     if consequence_type_dict is None:
@@ -41,7 +40,12 @@ def process_consequence_type_file(snp_2_gene_file, consequence_type_dict=None):
             logger.warning('Skip line with missing gene ID: {}'.format(line))
             continue
 
-        process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term)
+        # Include transcript if present
+        if len(line_list) >= 5:
+            ensembl_transcript_id = line_list[4]
+            process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term, ensembl_transcript_id)
+        else:
+            process_gene(consequence_type_dict, variant_id, ensembl_gene_id, so_term)
 
     logger.info('{} rs->ENSG/SOterms mappings loaded'.format(len(consequence_type_dict)))
     return consequence_type_dict
@@ -112,9 +116,10 @@ class ConsequenceType:
     with relationship to ensembl gene IDs and SO terms
     """
 
-    def __init__(self, ensembl_gene_id, so_term):
+    def __init__(self, ensembl_gene_id, so_term, ensembl_transcript_id=None):
         self.ensembl_gene_id = ensembl_gene_id
         self.so_term = so_term
+        self.ensembl_transcript_id = ensembl_transcript_id
 
     def __eq__(self, other):
         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
diff --git a/tests/output_generation/resources/expected_annotation_output.xml.gz b/tests/output_generation/resources/expected_annotation_output.xml.gz
index 11e586b44c9fb6ff121ab487ea3885e87cba7fdd..fd2b56cd58246150388794d5aebdff3ae8a51947 100644
GIT binary patch
[base85-encoded gzip data omitted: literal 2174 (new) and literal 2167 (old) for the updated expected_annotation_output.xml.gz test fixture]
From: April Shen
Date: Mon, 11 Dec 2023 14:39:49 +0000
Subject: [PATCH 8/9] update documentation

---
 README.md                                     | 1 +
 .../repeat_expansion_variants/pipeline.py     | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7e9e5864..87c0676b
100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ nextflow run ${CODE_ROOT}/pipelines/annotation_pipeline.nf \ --mappings ${LATEST_MAPPINGS} \ -resume ``` +You can use the `--include_transcripts` flag to also include transcript annotations with the functional consequences. By default, the pipeline will download and annotate the latest ClinVar XML dump from [FTP](https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/). If you want to run it on an existing XML file, you can pass it via the `--clinvar` flag. diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index 97e83b08..6002fcb5 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -249,7 +249,7 @@ def main(clinvar_xml, include_transcripts, output_consequences=None, output_data Args: clinvar_xml: filepath to the ClinVar XML file. - include_transcripts: + include_transcripts: whether to include transcript IDs along with consequence terms. output_consequences: filepath to the output file with variant consequences. The file uses a 6-column format compatible with the VEP mapping pipeline (see /consequence_prediction/README.md). output_dataframe: filepath to the output file with the full dataframe used in the analysis. This will contain From 4f8d1b15ffc24baa23e3cba340f9d0f2d660629c Mon Sep 17 00:00:00 2001 From: April Shen Date: Tue, 19 Dec 2023 08:36:27 +0000 Subject: [PATCH 9/9] update log statement --- .../repeat_expansion_variants/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py index 6002fcb5..9c2bff8f 100644 --- a/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py +++ b/cmat/consequence_prediction/repeat_expansion_variants/pipeline.py @@ -274,7 +274,7 @@ def main(clinvar_xml, include_transcripts, output_consequences=None, output_data logger.info('No variants to process') return None - logger.info('Match each record to Ensembl gene ID and name') + logger.info(f'Match each record to Ensembl gene ID{", transcript ID," if include_transcripts else ""} and gene name') variants = annotate_ensembl_gene_info(variants, include_transcripts) logger.info('Determine variant type and whether the record is complete')
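
For illustration, below is a minimal sketch of how a downstream consumer could read the transcript-aware consequences TSV that these patches produce. The column order assumed here (variant identifier, Ensembl gene ID, gene symbol, SO term, then an optional Ensembl transcript ID in the fifth column) is inferred from `process_consequence_type_file` in PATCH 7/9; the `load_consequences` helper itself is hypothetical and not part of cmat.

```python
from collections import defaultdict

def load_consequences(path):
    """Map variant ID -> list of (gene_id, so_term, transcript_id or None).

    Hypothetical helper; the column order is an assumption based on
    process_consequence_type_file, where the transcript ID, when present,
    sits at index 4.
    """
    consequences = defaultdict(list)
    with open(path) as tsv:
        for line in tsv:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 4:
                continue  # skip malformed rows rather than failing outright
            variant_id, gene_id, _gene_symbol, so_term = fields[:4]
            # The fifth column only exists when --include-transcripts was used
            transcript_id = fields[4] if len(fields) >= 5 else None
            consequences[variant_id].append((gene_id, so_term, transcript_id))
    return consequences
```

Rows generated without the flag simply map to a `None` transcript, mirroring the `ensembl_transcript_id=None` default on `ConsequenceType`, so a consumer can handle both the four- and five-column formats with a single code path.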