Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #334: Normalization #336

Merged
merged 13 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
322 changes: 286 additions & 36 deletions notebooks/testingstuff.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/ga4gh/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
parse_ga4gh_identifier, GA4GHComputeIdentifierWhen, use_ga4gh_compute_identifier_when
)
from ._internal.pydantic import (
is_pydantic_instance, is_curie_type, is_identifiable, is_literal, pydantic_copy
is_pydantic_instance, is_curie_type, is_ga4gh_identifiable, is_literal, pydantic_copy
)
from ._internal import models as core_models

Expand Down
6 changes: 3 additions & 3 deletions src/ga4gh/core/_internal/enderef.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
is_pydantic_instance,
is_list,
is_curie_type,
is_identifiable,
is_ga4gh_identifiable,
get_pydantic_root,
pydantic_copy)

Expand Down Expand Up @@ -60,7 +60,7 @@ def _enref(o):

if not is_pydantic_instance(o):
raise ValueError("Called ga4gh_enref() with non-pydantic instance")
if not is_identifiable(o):
if not is_ga4gh_identifiable(o):
raise ValueError("Called ga4gh_enref() with non-identifiable object")

# in-place replacement on object copy
Expand Down Expand Up @@ -101,7 +101,7 @@ def _deref(o):

if not is_pydantic_instance(o):
raise ValueError("Called ga4gh_deref() with non-pydantic instance")
if not is_identifiable(o):
if not is_ga4gh_identifiable(o):
raise ValueError("Called ga4gh_deref() with non-identifiable object")

# in-place replacement on object copy
Expand Down
46 changes: 23 additions & 23 deletions src/ga4gh/core/_internal/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from .pydantic import (
is_pydantic_instance,
is_curie_type,
is_identifiable,
is_ga4gh_identifiable,
getattr_in,
get_pydantic_root,
is_pydantic_custom_type
Expand Down Expand Up @@ -143,7 +143,7 @@ def ga4gh_identify(vro):
'ga4gh:VSL.u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx'

"""
if is_identifiable(vro):
if is_ga4gh_identifiable(vro):
when_rule = ga4gh_compute_identifier_when.get(GA4GHComputeIdentifierWhen.ALWAYS)
do_compute = False
ir = None
Expand Down Expand Up @@ -281,7 +281,7 @@ def identify_all(

if is_pydantic_custom_type(input_obj):
val = export_pydantic_model(input_obj)
if isinstance(val, str) and is_curie_type(val) and is_ga4gh_identifier(val):
if isinstance(val, str) and is_ga4gh_identifier(val):
val = parse_ga4gh_identifier(val)["digest"]
output_obj = val
elif is_pydantic_instance(input_obj):
Expand All @@ -307,7 +307,7 @@ def identify_all(
# Assumes any obj with 'digest' should be collapsed.
collapsed_output_obj = collapse_identifiable_values(output_obj)
# Add a digest to the output if it is identifiable
if is_identifiable(input_obj):
if is_ga4gh_identifiable(input_obj):
# Compute digest for updated object, not re-running compaction
output_obj["digest"] = ga4gh_digest(collapsed_output_obj, do_compact=False)
else:
Expand All @@ -317,22 +317,22 @@ def identify_all(
return output_obj


def scrape_model_metadata(obj, meta={}) -> dict:
"""
For a Pydantic object obj, pull out .ga4gh.identifiable
and .ga4gh.keys and put them in meta keyed by the class name of obj
"""
assert isinstance(obj, BaseModel)
name = type(obj).__name__
if is_pydantic_custom_str_type(obj):
meta[name] = {"identifiable": False, "keys": None}
else:
meta[name] = {}
identifiable = getattr_in(obj, ["ga4gh", "identifiable"])
if identifiable:
meta[name]["identifiable"] = identifiable
keys = getattr_in(obj, ["ga4gh", "keys"])
if keys and len(keys) > 0:
meta[name]["keys"] = keys
# TODO recurse into fields
return meta
# def scrape_model_metadata(obj, meta={}) -> dict:
# """
# For a Pydantic object obj, pull out .ga4gh.identifiable
# and .ga4gh.keys and put them in meta keyed by the class name of obj
# """
# assert isinstance(obj, BaseModel)
# name = type(obj).__name__
# if is_pydantic_custom_str_type(obj):
# meta[name] = {"identifiable": False, "keys": None}
# else:
# meta[name] = {}
# identifiable = getattr_in(obj, ["ga4gh", "identifiable"])
# if identifiable:
# meta[name]["identifiable"] = identifiable
# keys = getattr_in(obj, ["ga4gh", "keys"])
# if keys and len(keys) > 0:
# meta[name]["keys"] = keys
# # TODO recurse into fields
# return meta
10 changes: 5 additions & 5 deletions src/ga4gh/core/_internal/pydantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ def getattr_in(obj, names) -> Any:
return v


def is_identifiable(o: Any) -> bool:
def is_ga4gh_identifiable(o: Any) -> bool:
"""
Determine if object is identifiable. An object is considered identifiable if it
contains a `ga4gh_digest` attribute
Determine if object is GA4GH identifiable. An object is considered
GA4GH identifiable if it contains a `ga4gh_prefix` attribute

:param o: Object
:return: `True` if `o` has `ga4gh_digest` attribute. `False` otherwise.
:return: `True` if `o` has `ga4gh_prefix` attribute. `False` otherwise.
"""
return getattr_in(o, ['ga4gh', 'identifiable'])
return bool(getattr_in(o, ['ga4gh', 'prefix']))


def is_literal(o: Any) -> bool:
Expand Down
27 changes: 20 additions & 7 deletions src/ga4gh/vrs/_internal/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from pydantic import BaseModel, ConfigDict, Field, RootModel, constr

from ga4gh.core._internal.pydantic import (
is_identifiable,
is_ga4gh_identifiable,
getattr_in
)
from ga4gh.core._internal.models import IRI, _Entity
Expand Down Expand Up @@ -91,7 +91,7 @@ def pydantic_class_refatt_map():
# Types directly reffable
reffable_classes = list(filter(
lambda c: ('id' in c.model_fields
and is_identifiable(c)),
and is_ga4gh_identifiable(c)),
model_classes
))
# Types reffable because they are a union of reffable types
Expand Down Expand Up @@ -172,16 +172,17 @@ class _ValueObject(_Entity):
description='A sha512t24u digest created using the VRS Computed Identifier algorithm.',
)

class ga4gh:
keys: List[str]


class _Ga4ghIdentifiableObject(_ValueObject):
"""A contextual value object for which a GA4GH computed identifier can be created."""

type: str

class ga4gh:
identifiable = True
class ga4gh(_ValueObject.ga4gh):
prefix: str
keys: List[str]


class Expression(BaseModel):
Expand Down Expand Up @@ -239,7 +240,7 @@ class SequenceReference(_ValueObject):
)
residueAlphabet: Optional[ResidueAlphabet] = None

class ga4gh:
class ga4gh(_ValueObject.ga4gh):
assigned: bool = Field(
True,
description='This special property indicates that the `digest` field follows an alternate convention and is expected to have the value assigned following that convention. For SequenceReference, it is expected the digest will be the refget accession value without the `SQ.` prefix.'
Expand All @@ -262,6 +263,13 @@ class ReferenceLengthExpression(_ValueObject):
None, description='The number of residues in the repeat subunit.'
)

class ga4gh(_ValueObject.ga4gh):
keys = [
'length',
'repeatSubunitLength',
'type'
]


class LiteralSequenceExpression(_ValueObject):
"""An explicit expression of a Sequence."""
Expand All @@ -271,6 +279,12 @@ class LiteralSequenceExpression(_ValueObject):
)
sequence: SequenceString = Field(..., description='the literal sequence')

class ga4gh(_ValueObject.ga4gh):
keys = [
'sequence',
'type'
]


class SequenceLocation(_Ga4ghIdentifiableObject):
"""A `Location` defined by an interval on a referenced `Sequence`."""
Expand Down Expand Up @@ -407,7 +421,6 @@ class GenotypeMember(_ValueObject):
)

class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
identifiable = False
keys = [
'type',
'count',
Expand Down
Binary file modified tests/extras/data/test_vcf_expected_altsonly_output.vcf.gz
Binary file not shown.
Binary file modified tests/extras/data/test_vcf_expected_output.vcf.gz
Binary file not shown.
Binary file modified tests/extras/data/test_vcf_expected_output_no_vrs_attrs.vcf.gz
Binary file not shown.
11 changes: 6 additions & 5 deletions tests/extras/test_allele_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def tlr(rest_dataproxy):
insertion_output = {
"location": {
"end": 20003010,
"start":20003010,
"start": 20003010,
"sequenceReference": {
"refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
"type": "SequenceReference"
Expand Down Expand Up @@ -265,6 +265,7 @@ def test_to_spdi(tlr):
assert 1 == len(to_spdi)
assert spdiexpr == to_spdi[0]


hgvs_tests = (
("NC_000013.11:g.32936732=", {
"id": "ga4gh:VA.GuDPEe-WojSx4b4DxupN3si1poaR61qL",
Expand Down Expand Up @@ -303,7 +304,7 @@ def test_to_spdi(tlr):
"type": "Allele"
}),
("NC_000007.14:g.55181220del", {
"id": "ga4gh:VA.BNV6SfAuqDYKTTRknLcS-QuTryF5rSBi",
"id": "ga4gh:VA.wlYnlMsWc0ZTPZb-nQv2dXHbFcXa6J9u",
"location": {
"id": "ga4gh:SL.hnIOG_kul0Lf3mO1ddTRFb0GbQhtQ19t",
"end": 55181220,
Expand Down Expand Up @@ -341,7 +342,7 @@ def test_to_spdi(tlr):
"type": "Allele"
}),
("NC_000013.11:g.32331093_32331094dup", {
"id": "ga4gh:VA.g-q4OzcyYFC5eVQFSrbXwgJScSREvrY-",
"id": "ga4gh:VA.x5iNzjjXbb1-wWTBLMBcicYlCMwYoedq",
"location": {
"id": "ga4gh:SL.PJ8lHWhAMNRSrxHvkarfDjRWxF-GwaJ_",
"end": 32331094,
Expand All @@ -361,7 +362,7 @@ def test_to_spdi(tlr):
"type": "Allele"
}),
("NC_000013.11:g.32316467dup", {
"id": "ga4gh:VA._KlbF6GZCbuLxbL9z4hZE3oZSLzBHstS",
"id": "ga4gh:VA.ZAyA7Mmd7ERWN6CEd6muxn2mk_gTvEvF",
"location": {
"id": "ga4gh:SL.LURTeRdwh5bQf_QqPBoaA--MECYmrY5U",
"end": 32316467,
Expand Down Expand Up @@ -434,7 +435,7 @@ def test_hgvs(tlr, hgvsexpr, expected):

def test_to_hgvs_invalid(tlr):
# IRI is passed
iri_vo = models.Allele(
iri_vo = models.Allele(
**{
"location": {
"end": 1263,
Expand Down
18 changes: 15 additions & 3 deletions tests/extras/test_vcf_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@

TEST_DATA_DIR = "tests/extras/data"


@pytest.fixture
def vcf_annotator():
return VCFAnnotator("rest")


@pytest.mark.vcr
def test_annotate_vcf_grch38_noattrs(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -26,12 +28,14 @@ def test_annotate_vcf_grch38_noattrs(vcf_annotator, vcr_cassette):
out_vcf_lines = out_vcf.readlines()
with gzip.open(expected_vcf_no_vrs_attrs, "rt") as expected_output:
expected_output_lines = expected_output.readlines()
assert out_vcf_lines == expected_output_lines
for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines):
assert actual_line == expected_line
assert os.path.exists(output_vrs_pkl)
assert vcr_cassette.all_played
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_grch38_attrs(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -46,12 +50,14 @@ def test_annotate_vcf_grch38_attrs(vcf_annotator, vcr_cassette):
out_vcf_lines = out_vcf.readlines()
with gzip.open(expected_vcf, "rt") as expected_output:
expected_output_lines = expected_output.readlines()
assert out_vcf_lines == expected_output_lines
for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines):
assert actual_line == expected_line
assert os.path.exists(output_vrs_pkl)
assert vcr_cassette.all_played
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_grch38_attrs_altsonly(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -66,12 +72,14 @@ def test_annotate_vcf_grch38_attrs_altsonly(vcf_annotator, vcr_cassette):
out_vcf_lines = out_vcf.readlines()
with gzip.open(expected_altsonly_vcf, "rt") as expected_output:
expected_output_lines = expected_output.readlines()
assert out_vcf_lines == expected_output_lines
for actual_line, expected_line in zip(out_vcf_lines, expected_output_lines):
assert actual_line == expected_line
assert os.path.exists(output_vrs_pkl)
assert vcr_cassette.all_played
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_grch37_attrs(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -92,6 +100,7 @@ def test_annotate_vcf_grch37_attrs(vcf_annotator, vcr_cassette):
os.remove(output_vcf)
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_pickle_only(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -106,6 +115,7 @@ def test_annotate_vcf_pickle_only(vcf_annotator, vcr_cassette):
assert vcr_cassette.all_played
os.remove(output_vrs_pkl)


@pytest.mark.vcr
def test_annotate_vcf_vcf_only(vcf_annotator, vcr_cassette):
vcr_cassette.allow_playback_repeats = False
Expand All @@ -125,13 +135,15 @@ def test_annotate_vcf_vcf_only(vcf_annotator, vcr_cassette):
assert not os.path.exists(output_vrs_pkl)
os.remove(output_vcf)


def test_annotate_vcf_input_validation(vcf_annotator):
input_vcf = f"{TEST_DATA_DIR}/test_vcf_input.vcf"

with pytest.raises(VCFAnnotatorException) as e:
vcf_annotator.annotate(input_vcf)
assert str(e.value) == "Must provide one of: `vcf_out` or `vrs_pickle_out`"


@pytest.mark.vcr
def test_get_vrs_object_invalid_input(vcf_annotator, caplog):
"""Test that _get_vrs_object method works as expected with invalid input"""
Expand Down
Loading