Skip to content

Commit

Permalink
feat: cnv translator supports uncertain ranges for g. and m. (#387)
Browse files Browse the repository at this point in the history
Added the capability for the CNV translator to support `Range` endpoints in SequenceLocations for g. and m. HGVS expressions.
  • Loading branch information
korikuzma authored Apr 10, 2024
1 parent a8925b2 commit 20133b6
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 6 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ extras =
psycopg2-binary
biocommons.seqrepo>=0.5.1
bioutils>=0.5.2
hgvs>=1.4
hgvs@git+https://github.com/biocommons/hgvs@225-uncertain-ranges
requests
dill~=0.3.7
click
Expand Down
50 changes: 47 additions & 3 deletions src/ga4gh/vrs/extras/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Optional, Union
from ga4gh.vrs.dataproxy import create_dataproxy, _DataProxy
from ga4gh.vrs.extras.decorators import lazy_property
from hgvs.location import Interval, SimplePosition, AAPosition, BaseOffsetInterval
import logging
import re

Expand Down Expand Up @@ -438,11 +439,14 @@ def _from_hgvs(self, hgvs_dup_del_expr: str, **kwargs):
copy_change: Copy change. If not provided, default is efo:0030067 for
deletions and efo:0030070 for duplications
"""
# sv = self._get_parsed_hgvs(hgvs_dup_del_expr)
sv = self.hgvs_tools.parse(hgvs_dup_del_expr)
if not sv:
return None

if sv.type not in {"g", "m"}:
err_msg = "Only 'g' and 'm' reference sequences are supported"
raise ValueError(err_msg)

sv_type = self.hgvs_tools.get_edit_type(sv)
if sv_type not in {"del", "dup"}:
raise ValueError("Must provide a 'del' or 'dup'")
Expand All @@ -454,10 +458,13 @@ def _from_hgvs(self, hgvs_dup_del_expr: str, **kwargs):
if not refget_accession:
return None

start = self._get_vrs_loc_start_end_val(sv.posedit.pos.start, is_start=True)
end = self._get_vrs_loc_start_end_val(sv.posedit.pos.end, is_start=False)

location = models.SequenceLocation(
sequenceReference=models.SequenceReference(refgetAccession=refget_accession),
start=sv.posedit.pos.start.base - 1,
end=sv.posedit.pos.end.base
start=start,
end=end
)

copies = kwargs.get("copies")
Expand All @@ -472,6 +479,43 @@ def _from_hgvs(self, hgvs_dup_del_expr: str, **kwargs):
cnv =self._post_process_imported_cnv(cnv)
return cnv

@staticmethod
def _get_vrs_loc_start_end_val(
    pos: Union[SimplePosition, AAPosition, BaseOffsetInterval, Interval],
    is_start: bool = True
) -> Optional[Union[int, models.Range]]:
    """Get VRS Location start or end value

    A single position yields an ``int``. An interval whose two bounds differ
    yields a ``models.Range``; an interval whose two bounds are equal collapses
    to that single value. A bound whose ``base`` is ``None`` is carried through
    as ``None``.

    :param pos: biocommons hgvs location instance for position
    :param is_start: ``True`` if ``pos`` represents VRS Sequence Location start.
        ``False`` if ``pos`` represents VRS Sequence Location end.
    :raise ValueError: If unsupported biocommons hgvs location is passed
    :return: VRS Location start or end value using inter-residue positions.
        ``None`` when the position has no known base.
    """
    def _get_pos_value(
        position: Optional[int],
        do_subtract_1: bool
    ) -> Optional[int]:
        """Get position value

        :param position: Position, or ``None`` when it is unknown
        :param do_subtract_1: Whether or not we need to subtract 1 for ``position``
        :return: Adjusted position value, or ``None`` if ``position`` is ``None``
        """
        # An unknown position has nothing to adjust. Guarding here keeps every
        # caller from attempting ``None - 1``.
        if position is None:
            return None
        return position - 1 if do_subtract_1 else position

    # NOTE(review): BaseOffsetInterval is checked here and then read via
    # ``.base``; confirm that class exposes ``.base`` rather than
    # ``.start``/``.end`` in the pinned hgvs version.
    if isinstance(pos, (SimplePosition, AAPosition, BaseOffsetInterval)):
        return _get_pos_value(pos.base, is_start)

    if isinstance(pos, Interval):
        # Only start values are shifted by 1; end values are used as given.
        start_val = _get_pos_value(pos.start.base, is_start)
        end_val = _get_pos_value(pos.end.base, is_start)
        if start_val == end_val:
            return start_val
        return models.Range([start_val, end_val])

    err_msg = f"HGVS Location is not supported: {type(pos)}"
    raise ValueError(err_msg)

def _post_process_imported_cnv(self, copy_number):
"""Provide common post-processing for imported Copy Numbers IN-PLACE."""
if self.identify:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000005.9
response:
body:
string: "{\n \"added\": \"2016-08-24T05:17:01Z\",\n \"aliases\": [\n \"GRCh37:5\",\n
\ \"GRCh37:chr5\",\n \"GRCh37.p10:5\",\n \"GRCh37.p10:chr5\",\n \"GRCh37.p11:5\",\n
\ \"GRCh37.p11:chr5\",\n \"GRCh37.p12:5\",\n \"GRCh37.p12:chr5\",\n
\ \"GRCh37.p13:5\",\n \"GRCh37.p13:chr5\",\n \"GRCh37.p2:5\",\n \"GRCh37.p2:chr5\",\n
\ \"GRCh37.p5:5\",\n \"GRCh37.p5:chr5\",\n \"GRCh37.p9:5\",\n \"GRCh37.p9:chr5\",\n
\ \"MD5:0740173db9ffd264d728f32784845cd7\",\n \"NCBI:NC_000005.9\",\n
\ \"refseq:NC_000005.9\",\n \"SEGUID:Ja+pA+dtRy6jSKdOZXN58wY0rK4\",\n
\ \"SHA1:25afa903e76d472ea348a74e657379f30634acae\",\n \"VMC:GS_vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n
\ \"sha512t24u:vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n \"ga4gh:SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX\",\n
\ \"hs37-1kg:5\",\n \"hs37d5:5\"\n ],\n \"alphabet\": \"ACGNT\",\n
\ \"length\": 180915260\n}\n"
headers:
Connection:
- close
Content-Length:
- '803'
Content-Type:
- application/json
Date:
- Fri, 05 Apr 2024 19:19:57 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000009.11
response:
body:
string: "{\n \"added\": \"2016-08-24T05:18:51Z\",\n \"aliases\": [\n \"GRCh37:9\",\n
\ \"GRCh37:chr9\",\n \"GRCh37.p10:9\",\n \"GRCh37.p10:chr9\",\n \"GRCh37.p11:9\",\n
\ \"GRCh37.p11:chr9\",\n \"GRCh37.p12:9\",\n \"GRCh37.p12:chr9\",\n
\ \"GRCh37.p13:9\",\n \"GRCh37.p13:chr9\",\n \"GRCh37.p2:9\",\n \"GRCh37.p2:chr9\",\n
\ \"GRCh37.p5:9\",\n \"GRCh37.p5:chr9\",\n \"GRCh37.p9:9\",\n \"GRCh37.p9:chr9\",\n
\ \"MD5:3e273117f15e0a400f01055d9f393768\",\n \"NCBI:NC_000009.11\",\n
\ \"refseq:NC_000009.11\",\n \"SEGUID:06BhLSlH1xeVNSYu0zRK9qIVmFg\",\n
\ \"SHA1:d3a0612d2947d7179535262ed3344af6a2159858\",\n \"VMC:GS_HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n
\ \"sha512t24u:HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n \"ga4gh:SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt\",\n
\ \"hs37-1kg:9\",\n \"hs37d5:9\"\n ],\n \"alphabet\": \"ACGNT\",\n
\ \"length\": 141213431\n}\n"
headers:
Connection:
- close
Content-Length:
- '805'
Content-Type:
- application/json
Date:
- Fri, 05 Apr 2024 19:12:51 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ interactions:
Content-Type:
- application/json
Date:
- Tue, 26 Mar 2024 16:28:06 GMT
- Fri, 05 Apr 2024 19:12:51 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000023.11
response:
body:
string: "{\n \"added\": \"2016-08-27T23:57:18Z\",\n \"aliases\": [\n \"GRCh38:X\",\n
\ \"GRCh38:chrX\",\n \"GRCh38.p1:X\",\n \"GRCh38.p1:chrX\",\n \"GRCh38.p10:X\",\n
\ \"GRCh38.p10:chrX\",\n \"GRCh38.p11:X\",\n \"GRCh38.p11:chrX\",\n
\ \"GRCh38.p12:X\",\n \"GRCh38.p12:chrX\",\n \"GRCh38.p2:X\",\n \"GRCh38.p2:chrX\",\n
\ \"GRCh38.p3:X\",\n \"GRCh38.p3:chrX\",\n \"GRCh38.p4:X\",\n \"GRCh38.p4:chrX\",\n
\ \"GRCh38.p5:X\",\n \"GRCh38.p5:chrX\",\n \"GRCh38.p6:X\",\n \"GRCh38.p6:chrX\",\n
\ \"GRCh38.p7:X\",\n \"GRCh38.p7:chrX\",\n \"GRCh38.p8:X\",\n \"GRCh38.p8:chrX\",\n
\ \"GRCh38.p9:X\",\n \"GRCh38.p9:chrX\",\n \"MD5:2b3a55ff7f58eb308420c8a9b11cac50\",\n
\ \"NCBI:NC_000023.11\",\n \"refseq:NC_000023.11\",\n \"SEGUID:Z9QbQrrPjpjXSMJesDYqC3A43lA\",\n
\ \"SHA1:67d41b42bacf8e98d748c25eb0362a0b7038de50\",\n \"VMC:GS_w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n
\ \"sha512t24u:w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n \"ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\"\n
\ ],\n \"alphabet\": \"ACGNRSTWY\",\n \"length\": 156040895\n}\n"
headers:
Connection:
- close
Content-Length:
- '978'
Content-Type:
- application/json
Date:
- Fri, 05 Apr 2024 19:12:51 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
version: 1
95 changes: 94 additions & 1 deletion tests/extras/test_cnv_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,91 @@ def tlr(rest_dataproxy):
'start': 32344742,
'type': 'SequenceLocation'},
'type': 'CopyNumberChange'}
)
),
(
"NC_000023.11:g.(31060227_31100351)_(33274278_33417151)dup",
None,
{
"copyChange": "efo:0030070",
"digest": "H0-_q06in6rsvLfq_5b-CSmP4ZQ6r7-Q",
"id": "ga4gh:CX.H0-_q06in6rsvLfq_5b-CSmP4ZQ6r7-Q",
"location": {
"digest": "R3FeXqOiAu8Vms7QngINQwIxW904fdWY",
"end": [33274278, 33417151],
"id": "ga4gh:SL.R3FeXqOiAu8Vms7QngINQwIxW904fdWY",
"sequenceReference": {
"refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
"type": "SequenceReference"
},
"start": [31060226, 31100350],
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
),
(
"NC_000009.11:g.(?_108337304)_(108337428_?)del",
None,
{
"copyChange": "efo:0030067",
"digest": "ANthOqEGX8MIn0kXuyQcn9bouYfbFgjH",
"id": "ga4gh:CX.ANthOqEGX8MIn0kXuyQcn9bouYfbFgjH",
"location": {
"digest": "lpDGeQvnz80iis8xSxoCX_Pulnu7wx4M",
"end": [108337428, None],
"id": "ga4gh:SL.lpDGeQvnz80iis8xSxoCX_Pulnu7wx4M",
"sequenceReference": {
"refgetAccession": "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
"type": "SequenceReference"
},
"start": [None, 108337303],
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
),
(
"NC_000005.9:g.(90136803)_(90159675)dup",
None,
{
"copyChange": "efo:0030070",
"digest": "YcbXUe21Bt1wQDV7zGM0lacOupkxduFS",
"id": "ga4gh:CX.YcbXUe21Bt1wQDV7zGM0lacOupkxduFS",
"location": {
"digest": "r82CARuf8IxOidMdvQCUcsXNp3XiHEVH",
"end": 90159675,
"id": "ga4gh:SL.r82CARuf8IxOidMdvQCUcsXNp3XiHEVH",
"sequenceReference": {
"refgetAccession": "SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX",
"type": "SequenceReference"
},
"start": 90136802,
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
),
(
"NC_000009.11:g.108337304_(108337428_?)del",
None,
{
"copyChange": "efo:0030067",
"digest": "brfJaiKCnSw-mvc3K9sUIEAyCN620PuD",
"id": "ga4gh:CX.brfJaiKCnSw-mvc3K9sUIEAyCN620PuD",
"location": {
"digest": "6myLdODZ8WgbEDXc3HLp88ZbG536NCM-",
"end": [108337428, None],
"id": "ga4gh:SL.6myLdODZ8WgbEDXc3HLp88ZbG536NCM-",
"sequenceReference": {
"refgetAccession": "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt",
"type": "SequenceReference"
},
"start": 108337303,
"type": "SequenceLocation"
},
"type": "CopyNumberChange"
}
)
)


Expand All @@ -76,6 +160,15 @@ def test_from_hgvs_cx(tlr, hgvsexpr ,copy_change, expected):
cx = tlr._from_hgvs(hgvsexpr, copy_change=copy_change)
assert cx.model_dump(exclude_none=True) == expected

@pytest.mark.vcr
def test_from_hgvs_cx_invalid(tlr):
    """test that _from_hgvs works correctly for copy number change invalid input"""
    # A c. expression should fail since it's not g. or m.
    expected_msg = "Only 'g' and 'm' reference sequences are supported"
    with pytest.raises(ValueError, match=expected_msg):
        tlr._from_hgvs("NM_001197320.1:c.281_283dup")


from_hgvs_cn_tests = (
("NC_000013.11:g.26440969_26443305del", 1,
Expand Down

0 comments on commit 20133b6

Please sign in to comment.