Skip to content

Commit

Permalink
add filtering to trait name parsing and evidence string generation
Browse files Browse the repository at this point in the history
  • Loading branch information
apriltuesday committed Sep 5, 2024
1 parent b014d60 commit 4fb0d28
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 24 deletions.
3 changes: 1 addition & 2 deletions bin/trait_mapping/parse_traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Parse traits from ClinVar XML")
parser.add_argument("-i", dest="input_filepath", required=True,
help="ClinVar XML dump file. One record per line.")
parser.add_argument("-i", dest="input_filepath", required=True, help="ClinVar XML dump file.")
parser.add_argument("-o", dest="output_traits_filepath", required=True,
help="path to output file for all traits for downstream processing")
parser.add_argument("-u", dest="output_for_platform", required=False,
Expand Down
12 changes: 12 additions & 0 deletions cmat/clinvar_xml_io/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Filtering functions that can be used in multiple pipelines.

# Identified as problematic submissions, e.g. too many unmappable trait names.
submission_names_to_exclude = ['SUB14299258']


def filter_by_submission_name(clinvar_set, names_to_exclude=None):
    """Return True to keep a ClinVar set, False to filter it out, based on submission names.

    A set is kept as soon as one of its submitted records has a submission_name
    that is not in the exclusion list. It is filtered out when every submitted
    record is in the exclusion list; a set with no submitted records is therefore
    also filtered out.

    :param clinvar_set: object exposing ``scvs``, an iterable of submitted records
        that each have a ``submission_name`` attribute.
    :param names_to_exclude: optional iterable of submission names to exclude.
        Defaults to the module-level ``submission_names_to_exclude``.
    :return: True if the set should be kept, False if it should be filtered out.
    """
    if names_to_exclude is None:
        # Read the module-level list at call time so later changes to it are honoured.
        names_to_exclude = submission_names_to_exclude
    # Materialise once: a one-shot iterable would otherwise be consumed by the first membership test.
    excluded = frozenset(names_to_exclude)
    return any(submitted_record.submission_name not in excluded
               for submitted_record in clinvar_set.scvs)
25 changes: 16 additions & 9 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

Expand Down Expand Up @@ -64,8 +65,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

logger.info('Processing ClinVar records')
i = -1
# TODO filter here
for clinvar_record in ClinVarDataset(clinvar_xml):
dataset = ClinVarDataset(clinvar_xml)
for clinvar_set in dataset.iter_cvs():
# If start & end provided, only process records in the range [start, end)
i += 1
if start and i < start:
Expand All @@ -79,7 +80,13 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

# Catch any exceptions for this record so we can continue processing.
try:
# Failure mode 0 (skip). Contains multiple clinical classification annotations.
# Failure mode 1 (fatal). Record is only supported by submissions deemed to be unusable.
if not filter_by_submission_name(clinvar_set):
report.clinvar_fatal_excluded_submission += 1
continue
clinvar_record = clinvar_set.rcv

# Failure mode 2 (skip). Contains multiple clinical classification annotations.
# This is new as of V2 of the ClinVar XSD and should definitely be supported at some point,
# but as it can cause parsing complications we catch these cases first.
# See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396
Expand All @@ -88,18 +95,18 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
report.clinvar_skip_multiple_clinical_classifications += 1
continue

# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# Failure mode 3 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# potentially mappable name).
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue
# Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# Failure mode 4 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# submissions being flagged.
if not clinvar_record.valid_clinical_significances:
report.clinvar_fatal_no_clinical_significance += 1
continue

# Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
# Failure mode 5 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue
Expand All @@ -111,7 +118,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

# Failure mode 4 (skip). No functional consequences are available.
# Failure mode 6 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue
Expand All @@ -122,7 +129,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# Failure mode 7 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# suitable EFO mappings were found in the database. This will still generate an evidence string, but is
# tracked as a failure so we can continue to measure mapping coverage.
if not contains_mapping(grouped_diseases):
Expand Down Expand Up @@ -176,7 +183,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
except Exception as e:
# We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
# records and printing the report.
logger.error(f'Problem generating evidence for {clinvar_record.accession}')
logger.error(f'Problem generating evidence for {clinvar_set.rcv.accession}')
logger.error(f'Error: {e}')
exception_raised = True
continue
Expand Down
5 changes: 4 additions & 1 deletion cmat/output_generation/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None):
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_fatal_no_clinical_significance = 0
self.clinvar_fatal_excluded_submission = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
Expand Down Expand Up @@ -88,7 +89,8 @@ def load_from_file(self, filename):

def compute_record_tallies(self):
"""Compute tallies of records fatal/skipped/done based on the more granular counts."""
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance
self.clinvar_fatal = (self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance +
self.clinvar_fatal_excluded_submission)
self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string +
self.clinvar_skip_multiple_clinical_classifications)
Expand All @@ -115,6 +117,7 @@ def print_report(self):
Fatal: Cannot produce evidence\t{self.clinvar_fatal}
No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
No clinical significance\t{self.clinvar_fatal_no_clinical_significance}
Excluded submissions\t{self.clinvar_fatal_excluded_submission}
Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Expand Down
14 changes: 2 additions & 12 deletions cmat/trait_mapping/trait_names_parsing.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from collections import Counter
from typing import Iterable

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
from cmat.trait_mapping.trait import Trait


Expand Down Expand Up @@ -31,8 +30,7 @@ def parse_trait_names(filepath: str) -> list:

dataset = ClinVarDataset(filepath)
for clinvar_set in dataset.iter_cvs():
# TODO where to put this logic (both the method & the exclusion list)?
if should_exclude_record(clinvar_set, ['SUB14299258']):
if not filter_by_submission_name(clinvar_set):
continue
clinvar_record = clinvar_set.rcv
trait_names_and_ids = set((trait.preferred_or_other_valid_name.lower(), trait.identifier)
Expand All @@ -53,11 +51,3 @@ def parse_trait_names(filepath: str) -> list:
associated_with_nt_expansion=associated_with_nt_expansion))

return traits


def should_exclude_record(clinvar_set: 'ClinVarSet', names_to_exclude: Iterable) -> bool:
    """Return True if every submitted record in the set has submission_name in the exclusion list.

    A set with no submitted records also returns True, since no record falls
    outside the exclusion list.

    :param clinvar_set: set whose ``scvs`` (submitted records) are inspected.
    :param names_to_exclude: iterable of submission names to exclude.
    :return: True if the record should be excluded, False otherwise.
    """
    # Materialise once: a one-shot iterable would otherwise be consumed by the first membership test.
    excluded = frozenset(names_to_exclude)
    return all(submitted_record.submission_name in excluded
               for submitted_record in clinvar_set.scvs)

0 comments on commit 4fb0d28

Please sign in to comment.