Skip to content

Commit

Permalink
Merge pull request #135 from tskir/eva-2090-ditch-haplotype-support
Browse files Browse the repository at this point in the history
EVA-2090 — Ditch haplotype and genotype support
  • Loading branch information
tskir authored Jul 24, 2020
2 parents b527112 + 30cec20 commit 3966508
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 19 deletions.
4 changes: 2 additions & 2 deletions clinvar-variant-types/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ All graphs in this section were generated from the file `ClinVarFullRelease_2020
- **CompoundHeterozygote.** Presumably this should include exactly two variants which are _trans_ phased and interpreted together.
- **Diplotype.** Similar, but at least one of the _trans_ phased alleles includes a haplotype. An example of this would be three variants located on one copy of the gene, and one variant in the second one, all interpreted together.

The most common case is the MeasureSet/Variant one, accounting for 1114689 out of 1115169 RCV records (as of the date when this report was compiled), or 99.96%.
As of July 2020, the most common case is the MeasureSet/Variant one, accounting for 1114689 out of 1117817 RCV records, or >99.7%. **Currently, this is the only type being processed by this pipeline.**

### Clinical significance

Expand All @@ -70,4 +70,4 @@ The distribution of records by star rating is:

![](mode-of-inheritance.png)

Only a small fraction of all records specify their mode of inheritance: 35,009 out of 1,114,689, or about 3%.
Only a small fraction of all records specify their mode of inheritance: 35,009 out of 1,114,689, or about 3%.
47 changes: 30 additions & 17 deletions eva_cttv_pipeline/evidence_string_generation/clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@


class ClinvarRecord(UserDict):
"""
Class of which instances hold data on individual clinvar records. Subclass of UserDict rather
than dict in order to use attributes
"""Instances of this class hold data on individual ClinVar records. It is a subclass of UserDict rather than a regular
dict in order to use attributes.
"""

# A score for the review status of the assigned clinical significance ranges from 0 to 4 and corresponds to the
# number of gold stars displayed on ClinVar website. See details here:
# number of "gold stars" displayed on the ClinVar website. See details here:
# https://www.ncbi.nlm.nih.gov/clinvar/docs/details/#review_status
score_map = {
"CRITERIA_PROVIDED_SINGLE_SUBMITTER": 1,
Expand All @@ -20,28 +19,40 @@ class ClinvarRecord(UserDict):
}

def __init__(self, cellbase_dict):
"""Initialise a ClinVar record object from JSON data. See /clinvar-variant-types/README.md for the in-depth
explanation of ClinVar data model. See also issue https://github.com/EBIvariation/eva-opentargets/issues/127
for the most recent discussions on changing support of different ClinVar record types.
"""
UserDict.__init__(self, cellbase_dict)
if "measureSet" in self.data['referenceClinVarAssertion']:
measure_list = self.data['referenceClinVarAssertion']["measureSet"]["measure"]
elif "measureSet" in self.data['referenceClinVarAssertion']["genotypeSet"]:
if 'measureSet' in self.data['referenceClinVarAssertion']:
# MeasureSet provides information on a variant or a set of variants located on the same chromosomal copy.
if self.data['referenceClinVarAssertion']['measureSet']['type'] == 'Variant':
# The measure "list" actually only contains a single variant. This is the only case we are currently
# supporting. As of July 2020, it accounts for >99.7% of all ClinVar records.
measure_list = self.data['referenceClinVarAssertion']['measureSet']['measure']
else:
# Uncommon record types, such as "Haplotype", "Phase unknown", or "Distinct chromosomes".
# Not currently supported.
measure_list = []
elif 'measureSet' in self.data['referenceClinVarAssertion']['genotypeSet']:
# The record contains a GenotypeSet, a rare subtype which contains an assertion about a group of variants
# from several chromosome copies. This could be either a CompoundHeterozygote or a Diplotype, and those
# types are currently not processed.
measure_list = []
for measure_set in self.data['referenceClinVarAssertion']["genotypeSet"]["measureSet"]:
for measure in measure_set["measure"]:
measure_list.append(measure)
else:
raise KeyError()
raise KeyError('ClinVar record contains neither a MeasureSet, nor a GenotypeSet')

self.measures = [ClinvarRecordMeasure(measure_dict, self) for measure_dict in measure_list]

@property
def date(self):
return datetime.utcfromtimestamp(
self.data['referenceClinVarAssertion']['dateLastUpdated'] / 1000).isoformat()
return datetime.utcfromtimestamp(self.data['referenceClinVarAssertion']['dateLastUpdated'] / 1000).isoformat()

@property
def score(self):
"""Returns a score for the review status of the assigned clinical significance. See score_map above. It should
be noted that currently this property is not used, but this might change in the future."""
be noted that currently this property is not used, but this might change in the future.
"""
return self.score_map.get(self.data['referenceClinVarAssertion']['clinicalSignificance']['reviewStatus'], 0)

@property
Expand Down Expand Up @@ -104,8 +115,7 @@ def observed_refs_list(self):

@property
def clinical_significance(self):
return \
self.data['referenceClinVarAssertion']['clinicalSignificance']['description']
return self.data['referenceClinVarAssertion']['clinicalSignificance']['description']

@property
def allele_origins(self):
Expand All @@ -118,6 +128,10 @@ def allele_origins(self):


class ClinvarRecordMeasure(UserDict):
"""This class represents individual ClinVar record "measures". Measures are essentially isolated variants, which
can be combined into MeasureSets (which include one or more Measures) or GenotypeSets. For a detailed description
of the ClinVar data model, see /clinvar-variant-types/.
"""

def __init__(self, clinvar_measure_dict, clinvar_record):
UserDict.__init__(self, clinvar_measure_dict)
Expand Down Expand Up @@ -195,4 +209,3 @@ def sequence_location_helper(self, attr):
if attr in sequence_location:
return sequence_location[attr]
return None

0 comments on commit 3966508

Please sign in to comment.