From 517a633e04a8d735034f2f014e720e8369c7b63f Mon Sep 17 00:00:00 2001 From: Dan Allan Date: Sat, 24 Feb 2024 17:00:25 -0500 Subject: [PATCH] Validate data source consistency. --- tiled/_tests/test_writing.py | 45 ++++++++++++++++++++++++++ tiled/catalog/adapter.py | 6 ++++ tiled/client/union.py | 5 ++- tiled/server/schemas.py | 6 +++- tiled/structures/core.py | 9 ++++++ tiled/structures/data_source.py | 56 +++++++++++++++++++++++++++++++-- 6 files changed, 123 insertions(+), 4 deletions(-) diff --git a/tiled/_tests/test_writing.py b/tiled/_tests/test_writing.py index 6bc0ff3ea..6e26c0ca7 100644 --- a/tiled/_tests/test_writing.py +++ b/tiled/_tests/test_writing.py @@ -464,3 +464,48 @@ def test_union_one_table(tree): structure=structure, ) client.create_union([data_source], key="x") + + +def test_union_two_tables(tree): + with Context.from_app(build_app(tree)) as context: + client = from_context(context) + df1 = pandas.DataFrame({"A": [], "B": []}) + df2 = pandas.DataFrame({"C": [], "D": [], "E": []}) + structure1 = TableStructure.from_pandas(df1) + structure2 = TableStructure.from_pandas(df2) + client.create_union( + [ + DataSource( + structure_family=StructureFamily.table, + structure=structure1, + ), + DataSource( + structure_family=StructureFamily.table, + structure=structure2, + ), + ], + key="x", + ) + + +def test_union_two_tables_colliding_keys(tree): + with Context.from_app(build_app(tree)) as context: + client = from_context(context) + df1 = pandas.DataFrame({"A": [], "B": []}) + df2 = pandas.DataFrame({"A": [], "C": [], "D": []}) + structure1 = TableStructure.from_pandas(df1) + structure2 = TableStructure.from_pandas(df2) + with fail_with_status_code(422): + client.create_union( + [ + DataSource( + structure_family=StructureFamily.table, + structure=structure1, + ), + DataSource( + structure_family=StructureFamily.table, + structure=structure2, + ), + ], + key="x", + ) diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py index 0a20521b1..6039a0ab5 100644 --- a/tiled/catalog/adapter.py +++ b/tiled/catalog/adapter.py @@ -4,6 +4,7 @@ import operator import os import re +import secrets import shutil import sys import uuid @@ -629,6 +630,11 @@ async def create_node( data_uri = str(self.context.writable_storage) + "".join( f"/{quote_plus(segment)}" for segment in (self.segments + [key]) ) + if structure_family == StructureFamily.union: + # Append a random suffix so that multiple data sources have + # unique names. + # TODO Can we do something more elegant? + data_uri += f"_{secrets.token_hex(4)}" init_storage = DEFAULT_INIT_STORAGE[data_source.structure_family] assets = await ensure_awaitable( init_storage, data_uri, data_source.structure diff --git a/tiled/client/union.py b/tiled/client/union.py index 0ffbfb173..c94e4a060 100644 --- a/tiled/client/union.py +++ b/tiled/client/union.py @@ -3,4 +3,7 @@ class UnionClient(BaseClient): def __repr__(self): - return f"<{type(self).__name__}>" + return ( + f"<{type(self).__name__} " + f"[{', '.join(item.structure_family for item in self.structure().contents)}]>" + ) diff --git a/tiled/server/schemas.py b/tiled/server/schemas.py index 6b28a35ba..9f7fe210e 100644 --- a/tiled/server/schemas.py +++ b/tiled/server/schemas.py @@ -11,7 +11,7 @@ import pydantic.generics from ..structures.core import StructureFamily -from ..structures.data_source import Management +from ..structures.data_source import Management, validate_data_sources from .pydantic_array import ArrayStructure from .pydantic_awkward import AwkwardStructure from .pydantic_sparse import SparseStructure @@ -410,6 +410,10 @@ def specs_uniqueness_validator(cls, v): raise pydantic.errors.ListUniqueItemsError() return v + @pydantic.validator("data_sources", always=True) + def check_consistency(cls, v, values): + return validate_data_sources(values["structure_family"], v) + class PostMetadataResponse(pydantic.BaseModel, Generic[ResourceLinksT]): id: str diff --git a/tiled/structures/core.py b/tiled/structures/core.py index c11b8f234..e5e794f47 100644 --- a/tiled/structures/core.py +++ b/tiled/structures/core.py @@ -9,6 +9,15 @@ from typing import Optional +class BaseStructureFamily(str, enum.Enum): + array = "array" + awkward = "awkward" + container = "container" + sparse = "sparse" + table = "table" + # excludes union, which DataSources cannot have + + class StructureFamily(str, enum.Enum): array = "array" awkward = "awkward" diff --git a/tiled/structures/data_source.py b/tiled/structures/data_source.py index 2a985b5d8..e8de6d063 100644 --- a/tiled/structures/data_source.py +++ b/tiled/structures/data_source.py @@ -1,8 +1,9 @@ +import collections import dataclasses import enum from typing import Any, List, Optional -from .core import StructureFamily +from ..structures.core import BaseStructureFamily, StructureFamily class Management(str, enum.Enum): @@ -23,7 +24,7 @@ class Asset: @dataclasses.dataclass class DataSource: - structure_family: StructureFamily + structure_family: BaseStructureFamily structure: Any id: Optional[int] = None mimetype: Optional[str] = None @@ -31,3 +32,54 @@ class DataSource: assets: List[Asset] = dataclasses.field(default_factory=list) management: Management = Management.writable key: Optional[str] = None + + +def validate_data_sources(node_structure_family, data_sources): + "Check that data sources are consistent." + return validators[node_structure_family](node_structure_family, data_sources) + + +def validate_container_data_sources(node_structure_family, data_sources): + if len(data_sources) > 1: + raise ValueError( + "A container node can be backed by 0 or 1 data source, " + f"not {len(data_sources)}" + ) + return data_sources + + +def validate_union_data_sources(node_structure_family, data_sources): + "Check that column names and keys of others (e.g. arrays) do not collide." + keys = set() + for data_source in data_sources: + if data_source.structure_family == StructureFamily.table: + columns = data_source.structure.columns + if keys.intersection(columns): + raise ValueError( + f"Two data sources provide colliding keys: {keys.intersection(columns)}" + ) + keys.update(columns) + else: + key = data_source.key + if key is None: + raise ValueError( + f"Data source of type {data_source.structure_family} " + "must have a non-NULL key." + ) + if key in keys: + raise ValueError(f"Collision: {key}") + keys.add(key) + return data_sources + + +def validate_other_data_sources(node_structure_family, data_sources): + if len(data_sources) != 1: + raise ValueError( + f"A {node_structure_family} node must be backed by 1 data source." + ) + return data_sources + + +validators = collections.defaultdict(lambda: validate_other_data_sources) +validators[StructureFamily.container] = validate_container_data_sources +validators[StructureFamily.union] = validate_union_data_sources