From d45b973306556bb5da63de700b9eb4ce70ca7412 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Mon, 10 Jul 2023 08:43:07 -0700 Subject: [PATCH] handle missing data_type --- dbt/adapters/bigquery/column.py | 32 ++++++++++++++++---------- dbt/adapters/bigquery/impl.py | 2 +- tests/unit/test_column.py | 40 +++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/dbt/adapters/bigquery/column.py b/dbt/adapters/bigquery/column.py index 2b8f9a1e9..a5a60cfc0 100644 --- a/dbt/adapters/bigquery/column.py +++ b/dbt/adapters/bigquery/column.py @@ -133,7 +133,7 @@ def column_to_bq_schema(self) -> SchemaField: def get_nested_column_data_types( columns: Dict[str, Dict[str, Any]], constraints: Optional[Dict[str, str]] = None, -) -> Dict[str, Dict[str, str]]: +) -> Dict[str, Dict[str, Optional[str]]]: """ columns: * Dictionary where keys are of flat columns names and values are dictionary of column attributes @@ -161,16 +161,16 @@ def get_nested_column_data_types( """ constraints = constraints or {} - nested_column_data_types: Dict[str, Union[str, Dict]] = {} + nested_column_data_types: Dict[str, Optional[Union[str, Dict]]] = {} for column in columns.values(): _update_nested_column_data_types( column["name"], - column["data_type"], + column.get("data_type"), constraints.get(column["name"]), nested_column_data_types, ) - formatted_nested_column_data_types: Dict[str, Dict[str, str]] = {} + formatted_nested_column_data_types: Dict[str, Dict[str, Optional[str]]] = {} for column_name, unformatted_column_type in nested_column_data_types.items(): formatted_nested_column_data_types[column_name] = { "name": column_name, @@ -193,9 +193,9 @@ def get_nested_column_data_types( def _update_nested_column_data_types( column_name: str, - column_data_type: str, + column_data_type: Optional[str], column_rendered_constraint: Optional[str], - nested_column_data_types: Dict[str, Union[str, Dict]], + nested_column_data_types: Dict[str, Optional[Union[str, 
Dict]]], ) -> None: """ Recursively update nested_column_data_types given a column_name, column_data_type, and optional column_rendered_constraint. @@ -218,9 +218,13 @@ def _update_nested_column_data_types( if len(column_name_parts) == 1: # Base case: column is not nested - store its data_type concatenated with constraint if provided. column_data_type_and_constraints = ( - column_data_type - if column_rendered_constraint is None - else f"{column_data_type} {column_rendered_constraint}" + ( + column_data_type + if column_rendered_constraint is None + else f"{column_data_type} {column_rendered_constraint}" + ) + if column_data_type + else None ) if existing_nested_column_data_type := nested_column_data_types.get(root_column_name): @@ -258,7 +262,9 @@ def _update_nested_column_data_types( ) -def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, Any]]) -> str: +def _format_nested_data_type( + unformatted_nested_data_type: Optional[Union[str, Dict[str, Any]]] +) -> Optional[str]: """ Recursively format a (STRUCT) data type given an arbitrarily nested data type structure. 
@@ -270,7 +276,9 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, >>> BigQueryAdapter._format_nested_data_type({'c': 'string not_null', 'd': {'e': 'string'}}) 'struct<c string not_null, d struct<e string>>' """ - if isinstance(unformatted_nested_data_type, str): + if unformatted_nested_data_type is None: + return None + elif isinstance(unformatted_nested_data_type, str): return unformatted_nested_data_type else: parent_data_type, *parent_constraints = unformatted_nested_data_type.pop( @@ -278,7 +286,7 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, ).split() or [None] formatted_nested_types = [ - f"{column_name} {_format_nested_data_type(column_type)}" + f"{column_name} {_format_nested_data_type(column_type) or ''}".strip() for column_name, column_type in unformatted_nested_data_type.items() ] diff --git a/dbt/adapters/bigquery/impl.py b/dbt/adapters/bigquery/impl.py index 5c965ca7c..353be08d8 100644 --- a/dbt/adapters/bigquery/impl.py +++ b/dbt/adapters/bigquery/impl.py @@ -300,7 +300,7 @@ def nest_column_data_types( cls, columns: Dict[str, Dict[str, Any]], constraints: Optional[Dict[str, str]] = None, - ) -> Dict[str, Dict[str, str]]: + ) -> Dict[str, Dict[str, Optional[str]]]: return get_nested_column_data_types(columns, constraints) def get_columns_in_relation(self, relation: BigQueryRelation) -> List[BigQueryColumn]: diff --git a/tests/unit/test_column.py b/tests/unit/test_column.py index 821a711df..10f30594e 100644 --- a/tests/unit/test_column.py +++ b/tests/unit/test_column.py @@ -14,6 +14,12 @@ None, {"a": {"name": "a", "data_type": "string"}}, ), + # Flat column - missing data_type + ( + {"a": {"name": "a"}}, + None, + {"a": {"name": "a", "data_type": None}}, + ), # Flat column - with constraints ( {"a": {"name": "a", "data_type": "string"}}, @@ -32,12 +38,24 @@ None, {"b": {"name": "b", "data_type": "struct<nested string>"}}, ), + # Single nested column, 1 level - missing data_type + ( + {"b.nested": {"name": "b.nested"}}, + None, + {"b": 
{"name": "b", "data_type": "struct<nested>"}}, + ), # Single nested column, 1 level - with constraints ( {"b.nested": {"name": "b.nested", "data_type": "string"}}, {"b.nested": "not null"}, {"b": {"name": "b", "data_type": "struct<nested string not null>"}}, ), + # Single nested column, 1 level - with constraints, missing data_type (constraints not valid without data_type) + ( + {"b.nested": {"name": "b.nested"}}, + {"b.nested": "not null"}, + {"b": {"name": "b", "data_type": "struct<nested>"}}, + ), # Single nested column, 1 level - with constraints + other keys ( {"b.nested": {"name": "b.nested", "data_type": "string", "other": "unpreserved"}}, @@ -151,6 +169,28 @@ }, }, ), + # Nested columns, multiple levels - missing data_type + ( + { + "b.user.name.first": { + "name": "b.user.name.first", + "data_type": "string", + }, + "b.user.name.last": { + "name": "b.user.name.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country"}, # missing data_type + }, + None, + { + "b": { + "name": "b", + "data_type": "struct<user struct<name struct<first string, last string>, id int64, country>>", + }, + }, + ), # Nested columns, multiple levels - with constraints! ( {