Skip to content

Commit

Permalink
Validate data source consistency.
Browse files Browse the repository at this point in the history
  • Loading branch information
danielballan committed Feb 24, 2024
1 parent 3f1cd60 commit 517a633
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 4 deletions.
45 changes: 45 additions & 0 deletions tiled/_tests/test_writing.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,3 +464,48 @@ def test_union_one_table(tree):
structure=structure,
)
client.create_union([data_source], key="x")


def test_union_two_tables(tree):
    """Create a union node backed by two tables with non-overlapping columns.

    The test passes if ``create_union`` completes without raising.
    """
    with Context.from_app(build_app(tree)) as context:
        client = from_context(context)
        # One empty frame per table; column sets {A, B} and {C, D, E} are disjoint.
        frames = (
            pandas.DataFrame({"A": [], "B": []}),
            pandas.DataFrame({"C": [], "D": [], "E": []}),
        )
        data_sources = [
            DataSource(
                structure_family=StructureFamily.table,
                structure=TableStructure.from_pandas(frame),
            )
            for frame in frames
        ]
        client.create_union(data_sources, key="x")


def test_union_two_tables_colliding_keys(tree):
    """Attempt a union of two tables that both declare column ``A``.

    The ``create_union`` call is wrapped in ``fail_with_status_code(422)``.
    """
    with Context.from_app(build_app(tree)) as context:
        client = from_context(context)
        # Both frames carry a column named "A", which is the collision under test.
        frames = (
            pandas.DataFrame({"A": [], "B": []}),
            pandas.DataFrame({"A": [], "C": [], "D": []}),
        )
        data_sources = [
            DataSource(
                structure_family=StructureFamily.table,
                structure=TableStructure.from_pandas(frame),
            )
            for frame in frames
        ]
        with fail_with_status_code(422):
            client.create_union(data_sources, key="x")
6 changes: 6 additions & 0 deletions tiled/catalog/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import operator
import os
import re
import secrets
import shutil
import sys
import uuid
Expand Down Expand Up @@ -629,6 +630,11 @@ async def create_node(
data_uri = str(self.context.writable_storage) + "".join(
f"/{quote_plus(segment)}" for segment in (self.segments + [key])
)
if structure_family == StructureFamily.union:
# Append a random suffix so that multiple data sources have
# unique names.
# TODO Can we do something more elegant?
data_uri += f"_{secrets.token_hex(4)}"
init_storage = DEFAULT_INIT_STORAGE[data_source.structure_family]
assets = await ensure_awaitable(
init_storage, data_uri, data_source.structure
Expand Down
5 changes: 4 additions & 1 deletion tiled/client/union.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@

class UnionClient(BaseClient):
    """Client for a union node."""

    def __repr__(self):
        """Return ``<ClassName [family, family, ...]>``.

        The bracketed list holds the structure family of each item in
        ``self.structure().contents``, in order.
        """
        # The earlier bare ``<ClassName>`` return was left in front of this
        # one, which made the detailed repr unreachable; it has been removed.
        families = ", ".join(
            item.structure_family for item in self.structure().contents
        )
        return f"<{type(self).__name__} [{families}]>"
6 changes: 5 additions & 1 deletion tiled/server/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pydantic.generics

from ..structures.core import StructureFamily
from ..structures.data_source import Management
from ..structures.data_source import Management, validate_data_sources
from .pydantic_array import ArrayStructure
from .pydantic_awkward import AwkwardStructure
from .pydantic_sparse import SparseStructure
Expand Down Expand Up @@ -410,6 +410,10 @@ def specs_uniqueness_validator(cls, v):
raise pydantic.errors.ListUniqueItemsError()
return v

@pydantic.validator("data_sources", always=True)
def check_consistency(cls, v, values):
    """Validate ``data_sources`` against the node's ``structure_family``.

    Delegates to ``validate_data_sources`` and returns its result. Any
    ``ValueError`` it raises is reported by pydantic as a validation error.
    """
    # pydantic omits fields that failed their own validation from ``values``.
    # Indexing would then raise KeyError, which pydantic does not convert to
    # a validation error. The structure_family failure is already reported,
    # so skip the consistency check in that case.
    if "structure_family" not in values:
        return v
    return validate_data_sources(values["structure_family"], v)


class PostMetadataResponse(pydantic.BaseModel, Generic[ResourceLinksT]):
id: str
Expand Down
9 changes: 9 additions & 0 deletions tiled/structures/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@
from typing import Optional


class BaseStructureFamily(str, enum.Enum):
    """Structure families that a single DataSource may declare.

    Members are ``str`` values, so each compares equal to its plain string
    name.
    """

    array = "array"
    awkward = "awkward"
    container = "container"
    sparse = "sparse"
    table = "table"
    # excludes union, which DataSources cannot have


class StructureFamily(str, enum.Enum):
array = "array"
awkward = "awkward"
Expand Down
56 changes: 54 additions & 2 deletions tiled/structures/data_source.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import collections
import dataclasses
import enum
from typing import Any, List, Optional

from .core import StructureFamily
from ..structures.core import BaseStructureFamily, StructureFamily


class Management(str, enum.Enum):
Expand All @@ -23,11 +24,62 @@ class Asset:

@dataclasses.dataclass
class DataSource:
    """Description of one data source backing a node."""

    # Annotated with BaseStructureFamily only; the stale duplicate
    # ``structure_family: StructureFamily`` annotation has been removed.
    structure_family: BaseStructureFamily
    structure: Any
    id: Optional[int] = None
    mimetype: Optional[str] = None
    # default_factory gives every instance its own dict / list.
    parameters: dict = dataclasses.field(default_factory=dict)
    assets: List[Asset] = dataclasses.field(default_factory=list)
    management: Management = Management.writable
    # Checked by validate_union_data_sources for non-table sources in a union.
    key: Optional[str] = None


def validate_data_sources(node_structure_family, data_sources):
    """Check that data sources are consistent.

    Looks up the validator registered for ``node_structure_family`` in the
    module-level ``validators`` mapping and returns whatever it returns.
    """
    validator = validators[node_structure_family]
    return validator(node_structure_family, data_sources)


def validate_container_data_sources(node_structure_family, data_sources):
    """Accept zero or one data source for a container node.

    Returns ``data_sources`` unchanged; raises ``ValueError`` when more than
    one data source is given.
    """
    count = len(data_sources)
    if count <= 1:
        return data_sources
    raise ValueError(
        "A container node can be backed by 0 or 1 data source, "
        f"not {count}"
    )


def validate_union_data_sources(node_structure_family, data_sources):
    """Check that column names and keys of others (e.g. arrays) do not collide.

    Table data sources contribute every name in ``structure.columns``; any
    other data source contributes its ``key``, which must not be ``None``.
    Returns ``data_sources`` unchanged, or raises ``ValueError`` on the first
    collision or missing key.
    """
    seen = set()
    for source in data_sources:
        if source.structure_family != StructureFamily.table:
            # Non-table sources are addressed by a single explicit key.
            name = source.key
            if name is None:
                raise ValueError(
                    f"Data source of type {source.structure_family} "
                    "must have a non-NULL key."
                )
            if name in seen:
                raise ValueError(f"Collision: {name}")
            seen.add(name)
            continue
        # Table sources claim all of their column names at once.
        columns = source.structure.columns
        overlap = seen.intersection(columns)
        if overlap:
            raise ValueError(
                f"Two data sources provide colliding keys: {overlap}"
            )
        seen.update(columns)
    return data_sources


def validate_other_data_sources(node_structure_family, data_sources):
    """Require exactly one data source.

    Returns ``data_sources`` unchanged; raises ``ValueError`` naming
    ``node_structure_family`` for any other count.
    """
    if len(data_sources) == 1:
        return data_sources
    raise ValueError(
        f"A {node_structure_family} node must be backed by 1 data source."
    )


# Validator lookup keyed by the node's structure family. Container and union
# nodes have dedicated validators; any other key falls back to
# validate_other_data_sources via the defaultdict factory.
validators = collections.defaultdict(
    lambda: validate_other_data_sources,
    {
        StructureFamily.container: validate_container_data_sources,
        StructureFamily.union: validate_union_data_sources,
    },
)

0 comments on commit 517a633

Please sign in to comment.