From 0da17dcf4a6a4660ff4fafd4f5c50bd4fc7d62e0 Mon Sep 17 00:00:00 2001 From: joocer Date: Tue, 5 Sep 2023 22:26:11 +0100 Subject: [PATCH] 1.6.0 --- README.md | 13 +-- data_expectations/__init__.py | 22 ++++- data_expectations/internals/__init__.py | 1 - data_expectations/internals/evaluate.py | 7 +- data_expectations/internals/expectations.py | 36 +-------- data_expectations/internals/models.py | 81 ++++--------------- data_expectations/version.py | 2 +- tests/test_documentation.py | 23 +++++- .../test_expect_column_names_to_match_set.py | 31 ------- tests/test_expectation_datamodel.py | 53 ++++-------- tests/test_load_expectations.py | 20 ++--- 11 files changed, 94 insertions(+), 195 deletions(-) delete mode 100644 tests/test_expect_column_names_to_match_set.py diff --git a/README.md b/README.md index 166aab5..c7ac68f 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,6 @@ Expectations can be used alongside, or in place of a schema validator, however E ## Provided Expectations - **expect_column_to_exist** (column) -- **expect_column_names_to_match_set** (columns, ignore_excess:true) - **expect_column_values_to_not_be_null** (column) - **expect_column_values_to_be_of_type** (column, expected_type, ignore_nulls:true) - **expect_column_values_to_be_in_type_list** (column, type_list, ignore_nulls:true) @@ -50,18 +49,20 @@ Data Expectations has no external dependencies, can be used ad hoc and in-the-mo ~~~python import data_expectations as de +from data_expectations import Expectation +from data_expectations import Behaviors -TEST_DATA = {"name":"charles","age":12} +TEST_DATA = {"name": "charles", "age": 12} set_of_expectations = [ - {"expectation": "expect_column_to_exist", "column": "name"}, - {"expectation": "expect_column_to_exist", "column": "age"}, - {"expectation": "expect_column_values_to_be_between", "column": "age", "minimum": 0, "maximum": 120}, + Expectation(Behaviors.EXPECT_COLUMN_TO_EXIST, column="name"), + Expectation(Behaviors.EXPECT_COLUMN_TO_EXIST, column="age"), + Expectation(Behaviors.EXPECT_COLUMN_VALUES_TO_BE_BETWEEN, column="age", config={"minimum": 0, "maximum": 120}), ] expectations = de.Expectations(set_of_expectations) try: de.evaluate_record(expectations, TEST_DATA) -except de.errors.ExpectationNotMetError: +except de.errors.ExpectationNotMetError: # pragma: no cover print("Data Didn't Meet Expectations") ~~~ diff --git a/data_expectations/__init__.py b/data_expectations/__init__.py index 5bedc7f..a7c9445 100644 --- a/data_expectations/__init__.py +++ b/data_expectations/__init__.py @@ -1,5 +1,25 @@ +from enum import Enum + + +class Behaviors(str, Enum): + EXPECT_COLUMN_TO_EXIST = "expect_column_to_exist" + EXPECT_COLUMN_VALUES_TO_NOT_BE_NULL = "expect_column_values_to_not_be_null" + EXPECT_COLUMN_VALUES_TO_BE_OF_TYPE = "expect_column_values_to_be_of_type" + EXPECT_COLUMN_VALUES_TO_BE_IN_TYPE_LIST = "expect_column_values_to_be_in_type_list" + EXPECT_COLUMN_VALUES_TO_BE_MORE_THAN = "expect_column_values_to_be_more_than" + EXPECT_COLUMN_VALUES_TO_BE_LESS_THAN = "expect_column_values_to_be_less_than" + EXPECT_COLUMN_VALUES_TO_BE_BETWEEN = "expect_column_values_to_be_between" + EXPECT_COLUMN_VALUES_TO_BE_INCREASING = "expect_column_values_to_be_increasing" + EXPECT_COLUMN_VALUES_TO_BE_DECREASING = "expect_column_values_to_be_decreasing" + EXPECT_COLUMN_VALUES_TO_BE_IN_SET = "expect_column_values_to_be_in_set" + EXPECT_COLUMN_VALUES_TO_MATCH_REGEX = "expect_column_values_to_match_regex" + EXPECT_COLUMN_VALUES_TO_MATCH_LIKE = "expect_column_values_to_match_like" + EXPECT_COLUMN_VALUES_LENGTH_TO_BE_BE = "expect_column_values_length_to_be_be" + EXPECT_COLUMN_VALUES_LENGTH_TO_BE_BETWEEN = "expect_column_values_length_to_be_between" + + from data_expectations.internals.expectations import Expectations -from data_expectations.internals.models import Expectation, ColumnExpectation +from data_expectations.internals.models import Expectation from data_expectations.internals.evaluate import evaluate_list from data_expectations.internals.evaluate import evaluate_record diff --git a/data_expectations/internals/__init__.py b/data_expectations/internals/__init__.py index 8b13789..e69de29 100644 --- a/data_expectations/internals/__init__.py +++ b/data_expectations/internals/__init__.py @@ -1 +0,0 @@ - diff --git a/data_expectations/internals/evaluate.py b/data_expectations/internals/evaluate.py index 1d5f484..20bdf5a 100644 --- a/data_expectations/internals/evaluate.py +++ b/data_expectations/internals/evaluate.py @@ -12,7 +12,6 @@ import typing -from data_expectations import ColumnExpectation from data_expectations import Expectations from data_expectations.errors import ExpectationNotMetError from data_expectations.errors import ExpectationNotUnderstoodError @@ -40,11 +39,7 @@ def evaluate_record(expectations: Expectations, record: dict, suppress_errors: b if expectation not in ALL_EXPECTATIONS: raise ExpectationNotUnderstoodError(expectation=expectation) - base_config = {"row": record, **expectation_definition.config} - - # Conditionally include the 'column' parameter - if isinstance(expectation_definition, ColumnExpectation): - base_config["column"] = expectation_definition.column + base_config = {"row": record, "column": expectation_definition.column, **expectation_definition.config} if not ALL_EXPECTATIONS[expectation](**base_config): if not suppress_errors: diff --git a/data_expectations/internals/expectations.py b/data_expectations/internals/expectations.py index 7845d8f..f1f85bc 100644 --- a/data_expectations/internals/expectations.py +++ b/data_expectations/internals/expectations.py @@ -41,7 +41,6 @@ from typing import List from typing import Union -from data_expectations.internals.models import ColumnExpectation from data_expectations.internals.models import Expectation from data_expectations.internals.text import sql_like_to_regex @@ -70,12 +69,9 @@ def __init__(self, set_of_expectations: Iterable[Union[str, dict, Expectation]]) if isinstance(exp, str): # Parse JSON string exp = json.loads(exp) - if isinstance(exp, dict): # Convert dict to Expectation or ColumnExpectation - if "column" in exp: - self.set_of_expectations.append(ColumnExpectation.load(exp)) - else: - self.set_of_expectations.append(Expectation.load(exp)) - elif is_dataclass(exp) and (isinstance(exp, Expectation) or isinstance(exp, ColumnExpectation)): + if isinstance(exp, dict): # Convert dict to Expectation + self.set_of_expectations.append(Expectation.load(exp)) + elif is_dataclass(exp) and isinstance(exp, Expectation): self.set_of_expectations.append(exp) @classmethod @@ -100,32 +96,6 @@ def reset(): # COLUMN EXPECTATIONS ################################################################################### - @staticmethod - def expect_column_names_to_match_set( - *, - row: dict, - columns: list, - ignore_excess: bool = True, - **kwargs, - ): - """ - Confirms that the columns in a record match the given set. - - Parameters: - row: dict - The record to be checked. - columns: list - List of expected column names. - ignore_excess: bool - If True, ignores columns not in the list. If False, ensures columns match the list exactly. - - Returns: bool - True if expectation is met, False otherwise. - """ - if ignore_excess: - return all(key in columns for key in row.keys()) - return sorted(columns) == sorted(list(row.keys())) - @staticmethod def expect_column_to_exist( *, diff --git a/data_expectations/internals/models.py b/data_expectations/internals/models.py index ad7f70a..8c59431 100644 --- a/data_expectations/internals/models.py +++ b/data_expectations/internals/models.py @@ -18,6 +18,8 @@ from typing import Type from typing import Union +from data_expectations import Behaviors + @dataclass class Expectation: @@ -25,94 +27,45 @@ class Expectation: Represents a general Data Expectation. """ - expectation: str + expectation: Behaviors + column: str config: Dict[str, Any] = field(default_factory=dict) + ignore_nulls: bool = True - def to_dict(self) -> Dict[str, Any]: + def dump(self) -> Dict[str, Any]: """ Converts the Expectation instance to a dictionary representation. Returns: A dictionary containing the expectation and its configuration. """ - return {"expectation": self.expectation, **self.config} + return { + "expectation": self.expectation, + "column": self.column, + "ignore_nulls": self.ignore_nulls, + **self.config, + } @classmethod - def load_base(cls: Type["Expectation"], serialized: Union[Dict[str, Any], str]) -> Dict[str, Any]: + def load(cls: Type["Expectation"], serialized: Union[Dict[str, Any], str]) -> "Expectation": """ - Loads a serialized Expectation and returns it as a dictionary. + Loads a serialized Expectation and returns it as an instance. Parameters: serialized: Serialized Expectation as a dictionary or JSON string. Returns: - A dictionary representation of the serialized Expectation. + An Expectation instance populated with the serialized data. """ if isinstance(serialized, str): serialized = dict(json.loads(serialized)) serialized_copy: dict = deepcopy(serialized) if "expectation" not in serialized_copy: raise ValueError("Missing 'expectation' key in Expectation.") - return serialized_copy - - @classmethod - def load(cls: Type["Expectation"], serialized: Union[Dict[str, Any], str]) -> "Expectation": - """ - Loads a serialized Expectation and returns it as an instance. - - Parameters: - serialized: Serialized Expectation as a dictionary or JSON string. - - Returns: - An Expectation instance populated with the serialized data. - """ - serialized_copy = cls.load_base(serialized) - expectation = serialized_copy.pop("expectation") - config = serialized_copy - return cls(expectation=expectation, config=config) - - -class ColumnExpectation(Expectation): - """ - Represents a Data Expectation related to a specific column. - """ - - def __init__(self, expectation: str, column: str, config: Dict[str, Any] = None): - """ - Initializes a ColumnExpectation instance. - - Parameters: - expectation: The expectation type as a string. - column: The column the expectation applies to. - config: Additional configuration as a dictionary. - """ - super().__init__(expectation, config or {}) - self.column = column - - def to_dict(self) -> Dict[str, Any]: - """ - Converts the ColumnExpectation instance to a dictionary representation. - - Returns: - A dictionary containing the expectation, column, and its configuration. - """ - return {"expectation": self.expectation, "column": self.column, **self.config} - - @classmethod - def load(cls: Type["ColumnExpectation"], serialized: Union[Dict[str, Any], str]) -> "ColumnExpectation": - """ - Loads a serialized ColumnExpectation and returns it as an instance. - - Parameters: - serialized: Serialized ColumnExpectation as a dictionary or JSON string. - - Returns: - A ColumnExpectation instance populated with the serialized data. - """ - serialized_copy = cls.load_base(serialized) if "column" not in serialized_copy: raise ValueError("Missing 'column' key in Expectation.") expectation = serialized_copy.pop("expectation") column = serialized_copy.pop("column") + ignore_nulls = serialized_copy.pop("ignore_nulls", True) config = serialized_copy - return cls(expectation=expectation, column=column, config=config) + return cls(expectation=expectation, column=column, ignore_nulls=ignore_nulls, config=config) diff --git a/data_expectations/version.py b/data_expectations/version.py index 431daee..5be518b 100644 --- a/data_expectations/version.py +++ b/data_expectations/version.py @@ -13,6 +13,6 @@ # Store the version here so: # 1) we don't load dependencies by storing it in __init__.py # 2) we can import it in setup.py for the same reason -__version__ = "1.5.0" +__version__ = "1.6.0" # nodoc - don't add to the documentation wiki diff --git a/tests/test_documentation.py b/tests/test_documentation.py index 5493ff2..c52650f 100644 --- a/tests/test_documentation.py +++ b/tests/test_documentation.py @@ -7,7 +7,7 @@ sys.path.insert(1, os.path.join(sys.path[0], "..")) -def test_example(): +def test_example_legacy(): import data_expectations as de TEST_DATA = {"name": "charles", "age": 12} @@ -30,6 +30,27 @@ def test_example(): print("Data Didn't Meet Expectations") +def test_example(): + import data_expectations as de + from data_expectations import Expectation + from data_expectations import Behaviors + + TEST_DATA = {"name": "charles", "age": 12} + + set_of_expectations = [ + Expectation(Behaviors.EXPECT_COLUMN_TO_EXIST, column="name"), + Expectation(Behaviors.EXPECT_COLUMN_TO_EXIST, column="age"), + Expectation(Behaviors.EXPECT_COLUMN_VALUES_TO_BE_BETWEEN, column="age", config={"minimum": 0, "maximum": 120}), + ] + + expectations = de.Expectations(set_of_expectations) + try: + de.evaluate_record(expectations, TEST_DATA) + except de.errors.ExpectationNotMetError: # pragma: no cover + print("Data Didn't Meet Expectations") + + if __name__ == "__main__": # pragma: no cover test_example() + test_example_legacy() print("✅ okay") diff --git a/tests/test_expect_column_names_to_match_set.py b/tests/test_expect_column_names_to_match_set.py deleted file mode 100644 index f6ad899..0000000 --- a/tests/test_expect_column_names_to_match_set.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import sys - -sys.path.insert(1, os.path.join(sys.path[0], "..")) - -import data_expectations as de - - -def test_expect_column_names_to_match_set(): - test_func = de.Expectations.expect_column_names_to_match_set - - assert test_func(row={"number": 7, "string": "d"}, columns=("number", "string")) - assert test_func( - row={"number": 7, "string": "d"}, - columns=("number", "string"), - ignore_excess=True, - ) - assert test_func( - row={"number": 7, "string": "d"}, - columns=("number", "string"), - ignore_excess=False, - ) - - assert test_func(row={"number": 7}, columns=("number", "string")) - assert test_func(row={"number": 7}, columns=("number", "string"), ignore_excess=True) - assert not test_func(row={"number": 7}, columns=("number", "string"), ignore_excess=False) - - -if __name__ == "__main__": # pragma: no cover - test_expect_column_names_to_match_set() - print("✅ okay") diff --git a/tests/test_expectation_datamodel.py b/tests/test_expectation_datamodel.py index ea4576d..6701c15 100644 --- a/tests/test_expectation_datamodel.py +++ b/tests/test_expectation_datamodel.py @@ -6,67 +6,46 @@ import json import pytest -from data_expectations import Expectation, ColumnExpectation - - -def test_expectation_to_dict(): - exp = Expectation("test_expectation", {"some_key": "some_value"}) - assert exp.to_dict() == {"expectation": "test_expectation", "some_key": "some_value"} - - -def test_expectation_load_from_dict(): - serialized = {"expectation": "test_expectation", "some_key": "some_value"} - exp = Expectation.load(serialized) - assert exp.expectation == "test_expectation" - assert exp.config == {"some_key": "some_value"} - - -def test_expectation_load_from_json_str(): - serialized = json.dumps({"expectation": "test_expectation", "some_key": "some_value"}) - exp = Expectation.load(serialized) - assert exp.expectation == "test_expectation" - assert exp.config == {"some_key": "some_value"} - - -def test_expectation_load_missing_key(): - serialized = {"some_key": "some_value"} - with pytest.raises(ValueError): - Expectation.load(serialized) +from data_expectations import Expectation def test_column_expectation_to_dict(): - exp = ColumnExpectation("test_expectation", "test_column", {"some_key": "some_value"}) - assert exp.to_dict() == {"expectation": "test_expectation", "column": "test_column", "some_key": "some_value"} + exp = Expectation("test_expectation", "test_column", {"some_key": "some_value"}) + assert exp.dump() == { + "expectation": "test_expectation", + "column": "test_column", + "some_key": "some_value", + "ignore_nulls": True, + } def test_column_expectation_load_from_dict(): serialized = {"expectation": "test_expectation", "column": "test_column", "some_key": "some_value"} - exp = ColumnExpectation.load(serialized) + exp = Expectation.load(serialized) assert exp.expectation == "test_expectation" assert exp.column == "test_column" assert exp.config == {"some_key": "some_value"} + assert exp.ignore_nulls == True def test_column_expectation_load_from_json_str(): - serialized = json.dumps({"expectation": "test_expectation", "column": "test_column", "some_key": "some_value"}) - exp = ColumnExpectation.load(serialized) + serialized = json.dumps( + {"expectation": "test_expectation", "column": "test_column", "some_key": "some_value", "ignore_nulls": False} + ) + exp = Expectation.load(serialized) assert exp.expectation == "test_expectation" assert exp.column == "test_column" assert exp.config == {"some_key": "some_value"} + assert exp.ignore_nulls == False def test_column_expectation_load_missing_key(): serialized = {"expectation": "test_expectation", "some_key": "some_value"} with pytest.raises(ValueError): - ColumnExpectation.load(serialized) + Expectation.load(serialized) if __name__ == "__main__": # pragma: no cover - test_expectation_to_dict() - test_expectation_load_from_dict() - test_expectation_load_from_json_str() - test_expectation_load_missing_key() - test_column_expectation_to_dict() test_column_expectation_load_from_dict() test_column_expectation_load_from_json_str() diff --git a/tests/test_load_expectations.py b/tests/test_load_expectations.py index a9e3d50..68d7329 100644 --- a/tests/test_load_expectations.py +++ b/tests/test_load_expectations.py @@ -5,35 +5,27 @@ sys.path.insert(1, os.path.join(sys.path[0], "..")) -from data_expectations import Expectations, Expectation, ColumnExpectation +from data_expectations import Expectations, Expectation def test_expectations_initializer(): # Create expectations as different types - exp1 = Expectation("expect_test", {"key1": "value1"}) - exp2_dict = {"expectation": "expect_test2", "key2": "value2"} - exp2 = Expectation.load(exp2_dict) - exp3_json = json.dumps({"expectation": "expect_test3", "key3": "value3"}) - exp3 = Expectation.load(json.loads(exp3_json)) - col_exp1 = ColumnExpectation("expect_test_col", "col1", {"key1": "value1"}) + col_exp1 = Expectation("expect_test_col", "col1", {"key1": "value1"}) col_exp2_dict = {"expectation": "expect_test_col2", "column": "col2", "key2": "value2"} - col_exp2 = ColumnExpectation.load(col_exp2_dict) + col_exp2 = Expectation.load(col_exp2_dict) col_exp3_json = json.dumps({"expectation": "expect_test_col3", "column": "col3", "key3": "value3"}) - col_exp3 = ColumnExpectation.load(json.loads(col_exp3_json)) + col_exp3 = Expectation.load(json.loads(col_exp3_json)) # Initialize Expectations class - expectations = Expectations([exp1, exp2_dict, exp3_json, col_exp1, col_exp2_dict, col_exp3_json]) + expectations = Expectations([col_exp1, col_exp2_dict, col_exp3_json]) # Validate - assert len(expectations.set_of_expectations) == 6 + assert len(expectations.set_of_expectations) == 3 assert isinstance(expectations.set_of_expectations[0], Expectation) assert isinstance(expectations.set_of_expectations[1], Expectation) assert isinstance(expectations.set_of_expectations[2], Expectation) - assert isinstance(expectations.set_of_expectations[3], ColumnExpectation) - assert isinstance(expectations.set_of_expectations[4], ColumnExpectation) - assert isinstance(expectations.set_of_expectations[5], ColumnExpectation) if __name__ == "__main__": # pragma: no cover