Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 435: Filter gene-related disorder submission from curation and evidence generation #443

Merged
merged 3 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions bin/trait_mapping/parse_traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Parse traits from ClinVar XML")
parser.add_argument("-i", dest="input_filepath", required=True,
help="ClinVar XML dump file. One record per line.")
parser.add_argument("-i", dest="input_filepath", required=True, help="ClinVar XML dump file.")
parser.add_argument("-o", dest="output_traits_filepath", required=True,
help="path to output file for all traits for downstream processing")
parser.add_argument("-u", dest="output_for_platform", required=False,
Expand Down
7 changes: 6 additions & 1 deletion cmat/clinvar_xml_io/clinvar_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from datetime import date

from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes
from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes, iterate_cvs_from_xml

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand All @@ -22,6 +23,10 @@ def __iter__(self):
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
yield ClinVarReferenceRecord(rcv, self.xsd_version)

def iter_cvs(self):
    """Lazily yield a ClinVarSet for each element that iterate_cvs_from_xml produces from self.clinvar_xml.

    Each yielded object is built from the raw element together with self.xsd_version.
    """
    yield from (
        ClinVarSet(cvs_element, self.xsd_version)
        for cvs_element in iterate_cvs_from_xml(self.clinvar_xml)
    )

def get_xsd_version(self):
# For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md
if 'xsi:noNamespaceSchemaLocation' in self.header_attr:
Expand Down
1 change: 0 additions & 1 deletion cmat/clinvar_xml_io/clinvar_reference_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from functools import cached_property

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification

from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements

Expand Down
4 changes: 2 additions & 2 deletions cmat/clinvar_xml_io/clinvar_set.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements

Expand All @@ -12,7 +12,7 @@ def __init__(self, cvs_xml, xsd_version):
self.cvs_xml = cvs_xml

rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
self.rcv = ClinVarRecord(rcv_elem, xsd_version)
self.rcv = ClinVarReferenceRecord(rcv_elem, xsd_version)

scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]
Expand Down
2 changes: 1 addition & 1 deletion cmat/clinvar_xml_io/clinvar_submitted_record.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from functools import cached_property

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element

logger = logging.getLogger(__name__)
Expand Down
12 changes: 12 additions & 0 deletions cmat/clinvar_xml_io/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Record filters shared between pipelines.

# Submissions identified as problematic, e.g. too many unmappable trait names.
submission_names_to_exclude = ['SUB14299258']


def filter_by_submission_name(clinvar_set):
    """Tell whether a ClinVar set should be kept, judged by the submission names of its submitted records.

    Returns True when at least one record in ``clinvar_set.scvs`` has a ``submission_name`` that is not in
    ``submission_names_to_exclude``. Returns False (i.e. filter out) when every submitted record is in the
    exclusion list, which includes the case where ``scvs`` is empty.
    """
    return any(
        scv.submission_name not in submission_names_to_exclude
        for scv in clinvar_set.scvs
    )
26 changes: 17 additions & 9 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

Expand Down Expand Up @@ -64,7 +65,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

logger.info('Processing ClinVar records')
i = -1
for clinvar_record in ClinVarDataset(clinvar_xml):
dataset = ClinVarDataset(clinvar_xml)
for clinvar_set in dataset.iter_cvs():
# If start & end provided, only process records in the range [start, end)
i += 1
if start and i < start:
Expand All @@ -78,7 +80,13 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

# Catch any exceptions for this record so we can continue processing.
try:
# Failure mode 0 (skip). Contains multiple clinical classification annotations.
# Failure mode 1 (fatal). Record is only supported by submissions deemed to be unusable.
if not filter_by_submission_name(clinvar_set):
report.clinvar_fatal_excluded_submission += 1
continue
clinvar_record = clinvar_set.rcv

# Failure mode 2 (skip). Contains multiple clinical classification annotations.
# This is new as of V2 of the ClinVar XSD and should definitely be supported at some point,
# but as it can cause parsing complications we catch these cases first.
# See GH issue for context: https://github.com/EBIvariation/CMAT/issues/396
Expand All @@ -87,18 +95,18 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
report.clinvar_skip_multiple_clinical_classifications += 1
continue

# Failure mode 1 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# Failure mode 3 (fatal). A ClinVar record contains no valid traits (traits which have at least one valid,
# potentially mappable name).
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue
# Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# Failure mode 4 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# submissions being flagged.
if not clinvar_record.valid_clinical_significances:
report.clinvar_fatal_no_clinical_significance += 1
continue

# Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
# Failure mode 5 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue
Expand All @@ -110,7 +118,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

# Failure mode 4 (skip). No functional consequences are available.
# Failure mode 6 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue
Expand All @@ -121,7 +129,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# Failure mode 7 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# suitable EFO mappings were found in the database. This will still generate an evidence string, but is
# tracked as a failure so we can continue to measure mapping coverage.
if not contains_mapping(grouped_diseases):
Expand Down Expand Up @@ -175,8 +183,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
except Exception as e:
# We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
# records and printing the report.
logger.error(f'Problem generating evidence for {clinvar_record.accession}')
logger.error(f'Error: {e}')
logger.error(f'Problem generating evidence for {clinvar_set.rcv.accession}')
logger.error(f'Error: {repr(e)}')
exception_raised = True
continue

Expand Down
5 changes: 4 additions & 1 deletion cmat/output_generation/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def __init__(self, trait_mappings=None, consequence_mappings=None):
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_fatal_no_clinical_significance = 0
self.clinvar_fatal_excluded_submission = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
Expand Down Expand Up @@ -88,7 +89,8 @@ def load_from_file(self, filename):

def compute_record_tallies(self):
"""Compute tallies of records fatal/skipped/done based on the more granular counts."""
self.clinvar_fatal = self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance
self.clinvar_fatal = (self.clinvar_fatal_no_valid_traits + self.clinvar_fatal_no_clinical_significance +
self.clinvar_fatal_excluded_submission)
self.clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string +
self.clinvar_skip_multiple_clinical_classifications)
Expand All @@ -115,6 +117,7 @@ def print_report(self):
Fatal: Cannot produce evidence\t{self.clinvar_fatal}
No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
No clinical significance\t{self.clinvar_fatal_no_clinical_significance}
Excluded submissions\t{self.clinvar_fatal_excluded_submission}
Skipped: Can be rescued by future improvements\t{self.clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Expand Down
9 changes: 7 additions & 2 deletions cmat/trait_mapping/trait_names_parsing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import Counter

from cmat import clinvar_xml_io
from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.filtering import filter_by_submission_name
from cmat.trait_mapping.trait import Trait


Expand All @@ -27,7 +28,11 @@ def parse_trait_names(filepath: str) -> list:
# Their curation is of highest importance regardless of how many records they are actually associated with.
nt_expansion_traits = set()

for clinvar_record in clinvar_xml_io.ClinVarDataset(filepath):
dataset = ClinVarDataset(filepath)
for clinvar_set in dataset.iter_cvs():
if not filter_by_submission_name(clinvar_set):
continue
clinvar_record = clinvar_set.rcv
trait_names_and_ids = set((trait.preferred_or_other_valid_name.lower(), trait.identifier)
for trait in clinvar_record.traits_with_valid_names)
for trait_tuple in trait_names_and_ids:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ chédiak-higashi syndrome http://www.orpha.net/ORDO/Orphanet_167 chédiak-higash
cobalamin c disease http://purl.obolibrary.org/obo/MONDO_0010184 methylmalonic aciduria and homocystinuria type cblC
cobalamin c disease http://www.orpha.net/ORDO/Orphanet_26 Methylmalonic acidemia with homocystinuria
cobalamin c disease http://www.orpha.net/ORDO/Orphanet_79282 Methylmalonic acidemia with homocystinuria, type cblC
coffin-siris syndrome 1 http://purl.obolibrary.org/obo/MONDO_0015452 Coffin-Siris syndrome
coffin-siris syndrome 1 http://purl.obolibrary.org/obo/MONDO_0007617 coffin-siris syndrome 1
cog1 congenital disorder of glycosylation http://purl.obolibrary.org/obo/MONDO_0012637 COG1-congenital disorder of glycosylation
cog7 congenital disorder of glycosylation http://purl.obolibrary.org/obo/MONDO_0012118 COG7-congenital disorder of glycosylation
cohen syndrome http://purl.obolibrary.org/obo/MONDO_0008999 cohen syndrome
Expand Down Expand Up @@ -278,7 +278,7 @@ hepatoencephalopathy due to combined oxidative phosphorylation defect type 1 htt
hereditary breast ovarian cancer syndrome http://purl.obolibrary.org/obo/MONDO_0003582 hereditary breast ovarian cancer syndrome
hereditary cancer-predisposing syndrome http://purl.obolibrary.org/obo/MONDO_0015356 hereditary neoplastic syndrome
hereditary diffuse gastric adenocarcinoma http://purl.obolibrary.org/obo/MONDO_0007648 hereditary diffuse gastric adenocarcinoma
hereditary diffuse leukoencephalopathy with spheroids http://www.orpha.net/ORDO/Orphanet_313808 Hereditary diffuse leukoencephalopathy with axonal spheroids and pigmented glia
hereditary diffuse leukoencephalopathy with spheroids http://www.orpha.net/ORDO/Orphanet_313808 Adult-onset leukoencephalopathy with axonal spheroids and pigmented glia
hereditary hemorrhagic telangiectasia http://purl.obolibrary.org/obo/MONDO_0019180 hereditary hemorrhagic telangiectasia
hereditary insensitivity to pain with anhidrosis http://purl.obolibrary.org/obo/MONDO_0009746 hereditary sensory and autonomic neuropathy type 4
hereditary nonpolyposis colorectal neoplasms http://www.ebi.ac.uk/efo/EFO_0009911 hereditary nonpolyposis colorectal carcinoma
Expand Down Expand Up @@ -338,7 +338,7 @@ inflammatory skin and bowel disease, neonatal, 1 http://purl.obolibrary.org/obo/
intellectual developmental disorder, autosomal dominant 64 http://purl.obolibrary.org/obo/MONDO_0030934 intellectual developmental disorder, autosomal dominant 64
intellectual disability http://purl.obolibrary.org/obo/HP_0001249 intellectual disability
intellectual disability, autosomal dominant 1 http://purl.obolibrary.org/obo/MONDO_0016459 2q23.1 microdeletion syndrome
intellectual disability, autosomal dominant 20 http://purl.obolibrary.org/obo/MONDO_0016456 5q14.3 microdeletion syndrome
intellectual disability, autosomal dominant 20 http://purl.obolibrary.org/obo/MONDO_0013266 intellectual disability, autosomal dominant 20
intellectual disability, autosomal dominant 5 http://purl.obolibrary.org/obo/MONDO_0012960 intellectual disability, autosomal dominant 5
intellectual disability, autosomal dominant 6 http://purl.obolibrary.org/obo/MONDO_0100172 intellectual disability, autosomal dominant
intellectual disability, autosomal dominant 9 http://purl.obolibrary.org/obo/MONDO_0013656 intellectual disability, autosomal dominant 9
Expand Down Expand Up @@ -508,7 +508,7 @@ retinitis pigmentosa-deafness syndrome http://purl.obolibrary.org/obo/MONDO_0019
retinoblastoma http://purl.obolibrary.org/obo/MONDO_0008380 retinoblastoma
rett syndrome http://purl.obolibrary.org/obo/MONDO_0010726 rett syndrome
rett syndrome, congenital variant http://purl.obolibrary.org/obo/MONDO_0010726 Rett syndrome
rhabdoid tumor predisposition syndrome 2 http://purl.obolibrary.org/obo/MONDO_0016473 familial rhabdoid tumor
rhabdoid tumor predisposition syndrome 2 http://purl.obolibrary.org/obo/MONDO_0013224 rhabdoid tumor predisposition syndrome 2
rod-cone dystrophy http://www.orpha.net/ORDO/Orphanet_1872 Cone rod dystrophy
rubinstein-taybi syndrome http://purl.obolibrary.org/obo/MONDO_0019188 rubinstein-taybi syndrome
ryr1-related disorders http://www.ebi.ac.uk/efo/EFO_0009143 ryr1-related disorders
Expand Down
Loading
Loading