diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 55d4bfaf..ba4899f1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,14 @@ ODS_Tools Changelog =================== +`3.1.5`_ + --------- +* [#80](https://github.com/OasisLMF/ODS_Tools/pull/80) - Added fields to analysis settings schema +* [#115](https://github.com/OasisLMF/ODS_Tools/pull/115) - fix issue when categorical column have a default value +* [#117](https://github.com/OasisLMF/ODS_Tools/pull/117) - Fix/fill empty +* [#78](https://github.com/OasisLMF/ODS_Tools/pull/78) - Release 3.1.4 +.. _`3.1.5`: https://github.com/OasisLMF/ODS_Tools/compare/3.1.4...3.1.5 + .. _`3.1.4`: https://github.com/OasisLMF/ODS_Tools/compare/3.1.3...3.1.4 `3.1.3`_ diff --git a/ods_tools/__init__.py b/ods_tools/__init__.py index 9863d825..56472d27 100644 --- a/ods_tools/__init__.py +++ b/ods_tools/__init__.py @@ -1,4 +1,4 @@ -__version__ = '3.1.4' +__version__ = '3.1.5' import logging diff --git a/ods_tools/oed/__init__.py b/ods_tools/oed/__init__.py index 996bf014..bec87ef9 100644 --- a/ods_tools/oed/__init__.py +++ b/ods_tools/oed/__init__.py @@ -5,7 +5,7 @@ from .common import ( OdsException, PANDAS_COMPRESSION_MAP, PANDAS_DEFAULT_NULL_VALUES, USUAL_FILE_NAME, OED_TYPE_TO_NAME, OED_NAME_TO_TYPE, OED_IDENTIFIER_FIELDS, VALIDATOR_ON_ERROR_ACTION, DEFAULT_VALIDATION_CONFIG, OED_PERIL_COLUMNS, fill_empty, - UnknownColumnSaveOption, BLANK_VALUES + UnknownColumnSaveOption, BLANK_VALUES, is_empty ) @@ -13,5 +13,5 @@ 'OedExposure', 'OedSchema', 'OedSource', 'ModelSettingSchema', 'AnalysisSettingSchema', 'OdsException', 'PANDAS_COMPRESSION_MAP', 'PANDAS_DEFAULT_NULL_VALUES', 'USUAL_FILE_NAME', 'OED_TYPE_TO_NAME', 'OED_NAME_TO_TYPE', 'OED_IDENTIFIER_FIELDS', 'VALIDATOR_ON_ERROR_ACTION', 'DEFAULT_VALIDATION_CONFIG', 'OED_PERIL_COLUMNS', 'fill_empty', - 'UnknownColumnSaveOption', 'BLANK_VALUES' + 'UnknownColumnSaveOption', 'BLANK_VALUES', 'is_empty' ] diff --git a/ods_tools/oed/common.py b/ods_tools/oed/common.py index 4cf02008..e1475ea7 100644 --- a/ods_tools/oed/common.py +++ b/ods_tools/oed/common.py @@ -143,6 +143,19 @@ def __get__(self, obj, type=None): BLANK_VALUES = {np.nan, '', None, pd.NA, pd.NaT} +dtype_to_python = { + 'Int8': int, + 'Int32': int, + 'Int64': int, + 'bytes': lambda x: bytes(x, 'utf-8'), + 'float64': float, + 'category': str +} + + +def is_empty(df, columns): + return (df[columns].isnull()) | (df[columns] == '') + def fill_empty(df, columns, value): if isinstance(columns, str): @@ -150,7 +163,7 @@ def fill_empty(df, columns, value): for column in columns: if df[column].dtypes.name == 'category' and value not in {None, np.nan}.union(df[column].cat.categories): df[column] = df[column].cat.add_categories(value) - df.loc[df[column].isin(BLANK_VALUES), column] = value + df.loc[is_empty(df, column), column] = value class UnknownColumnSaveOption(Enum): diff --git a/ods_tools/oed/oed_schema.py b/ods_tools/oed/oed_schema.py index eba6cb87..299eea3b 100644 --- a/ods_tools/oed/oed_schema.py +++ b/ods_tools/oed/oed_schema.py @@ -4,7 +4,7 @@ import numba as nb import numpy as np -from .common import OdsException, BLANK_VALUES, cached_property +from .common import OdsException, BLANK_VALUES, cached_property, dtype_to_python ENV_ODS_SCHEMA_PATH = os.getenv('ODS_SCHEMA_PATH') @@ -121,6 +121,25 @@ def peril_filtering(self, peril_ids, peril_filters): """ return jit_peril_filtering(peril_ids.to_numpy().astype('str'), peril_filters.to_numpy().astype('str'), self.nb_perils_dict) + @staticmethod + def get_default_from_ods_fields(ods_fields, field_name): + field_info = ods_fields.get(field_name.lower()) + if field_info is None: + return '' + if field_info['pd_dtype'] == 'category': + if field_info['Default'] != 'n/a': + return field_info['Default'] + else: + return '' + else: + if field_info['Default'] != 'n/a': + return dtype_to_python[field_info['pd_dtype']](field_info['Default']) + else: + return np.nan + + def get_default(self, field_name, oed_type='null'): + return self.get_default_from_ods_fields(self.schema['input_fields'][oed_type], field_name) + @staticmethod def to_universal_field_name(column: str): """ diff --git a/ods_tools/oed/source.py b/ods_tools/oed/source.py index 3756bf0b..bfae585a 100644 --- a/ods_tools/oed/source.py +++ b/ods_tools/oed/source.py @@ -6,8 +6,8 @@ import numpy as np from chardet.universaldetector import UniversalDetector -from .common import (OED_TYPE_TO_NAME, OdsException, PANDAS_COMPRESSION_MAP, PANDAS_DEFAULT_NULL_VALUES, is_relative, BLANK_VALUES, fill_empty, - UnknownColumnSaveOption, cached_property) +from .common import (OED_TYPE_TO_NAME, OdsException, PANDAS_COMPRESSION_MAP, PANDAS_DEFAULT_NULL_VALUES, is_relative, fill_empty, + UnknownColumnSaveOption, cached_property, is_empty) from .forex import convert_currency from .oed_schema import OedSchema @@ -267,7 +267,7 @@ def as_oed_type(cls, oed_df, column_to_field): if oed_df[column].dtype.name == 'category' and '' not in oed_df[column].dtype.categories: oed_df[column] = oed_df[column].cat.add_categories('') oed_df[column] = oed_df[column] # make a copy f the col in case it is read_only - oed_df.loc[oed_df[column].isin(BLANK_VALUES), column] = '' + oed_df.loc[is_empty(oed_df, column), column] = '' elif pd_dtype[column].startswith('Int'): to_tmp_dtype[column] = 'float' @@ -287,12 +287,7 @@ def prepare_df(cls, df, column_to_field, ods_fields): """ # set default values for col, field_info in column_to_field.items(): - if (field_info - and field_info['Default'] != 'n/a' - and (df[col].isna().any() or (field_info['pd_dtype'] == 'category' and df[col].isnull().any()))): - fill_empty(df, col, df[col].dtype.type(field_info['Default'])) - elif df[col].dtype.name == 'category': - fill_empty(df, col, '') + fill_empty(df, col, OedSchema.get_default_from_ods_fields(ods_fields, col)) # add required columns that allow blank values if missing present_field = set(field_info['Input Field Name'] for field_info in column_to_field.values()) diff --git a/ods_tools/oed/validator.py b/ods_tools/oed/validator.py index 5edfafd7..40fa3184 100644 --- a/ods_tools/oed/validator.py +++ b/ods_tools/oed/validator.py @@ -1,12 +1,13 @@ import functools import json +import numpy as np import logging from pathlib import Path from collections.abc import Iterable from .common import (OdsException, OED_PERIL_COLUMNS, OED_IDENTIFIER_FIELDS, DEFAULT_VALIDATION_CONFIG, - VALIDATOR_ON_ERROR_ACTION, BLANK_VALUES) + VALIDATOR_ON_ERROR_ACTION, BLANK_VALUES, is_empty) from .oed_schema import OedSchema logger = logging.getLogger(__name__) @@ -134,7 +135,7 @@ def check_required_fields(self): columns = [columns] for column in columns: if field_info.get("Allow blanks?").upper() == 'NO': - missing_value_df = oed_source.dataframe[oed_source.dataframe[column].isin(BLANK_VALUES)] + missing_value_df = oed_source.dataframe[is_empty(oed_source.dataframe, column)] if not missing_value_df.empty: invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source, 'msg': f"column '{column}' has missing values in \n" @@ -222,8 +223,9 @@ def check_occupancy_code(self): if occupancy_code_column is None: continue identifier_field = self.identifier_field_maps[oed_source] - invalid_occupancy_code = oed_source.dataframe[~oed_source.dataframe[occupancy_code_column].astype(str).isin( - set(self.exposure.oed_schema.schema['occupancy']) | BLANK_VALUES)] + invalid_occupancy_code = oed_source.dataframe[~(np.isin(oed_source.dataframe[occupancy_code_column].astype(str), + list(self.exposure.oed_schema.schema['occupancy'])) + | is_empty(oed_source.dataframe, occupancy_code_column))] if not invalid_occupancy_code.empty: invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source, 'msg': f"invalid OccupancyCode.\n" @@ -242,8 +244,9 @@ def check_construction_code(self): if construction_code_column is None: continue identifier_field = self.identifier_field_maps[oed_source] - invalid_construction_code = oed_source.dataframe[~oed_source.dataframe[construction_code_column].astype(str).isin( - set(self.exposure.oed_schema.schema['construction']) | BLANK_VALUES)] + invalid_construction_code = oed_source.dataframe[~(np.isin(oed_source.dataframe[construction_code_column].astype(str), + list(self.exposure.oed_schema.schema['construction'])) + | is_empty(oed_source.dataframe, construction_code_column))] if not invalid_construction_code.empty: invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source, 'msg': f"invalid ConstructionCode.\n" @@ -265,8 +268,9 @@ def check_country_and_area_code(self): identifier_field = self.identifier_field_maps[oed_source] area_code_column = self.field_to_column_maps[oed_source].get('AreaCode') if area_code_column is not None: - country_only_df = oed_source.dataframe[oed_source.dataframe[area_code_column].isin(BLANK_VALUES)] - country_area_df = oed_source.dataframe[~oed_source.dataframe[area_code_column].isin(BLANK_VALUES)] + country_only_df = oed_source.dataframe[is_empty(oed_source.dataframe, area_code_column)] + country_area_df = oed_source.dataframe[~is_empty(oed_source.dataframe, area_code_column)] + invalid_country_area = (country_area_df[ ~(country_area_df[[country_code_column, area_code_column]] .apply(tuple, axis=1) @@ -279,8 +283,9 @@ def check_country_and_area_code(self): f"{invalid_country_area[identifier_field + [country_code_column, area_code_column]]}"}) else: country_only_df = oed_source.dataframe - invalid_country = (country_only_df[~country_only_df[country_code_column] - .isin(set(self.exposure.oed_schema.schema['country']) | BLANK_VALUES)]) + invalid_country = (country_only_df[~(np.isin(country_only_df[country_code_column], + list(self.exposure.oed_schema.schema['country'])) + | is_empty(country_only_df, country_code_column))]) if not invalid_country.empty: invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source, 'msg': f"invalid CountryCode.\n" diff --git a/tests/test_ods_package.py b/tests/test_ods_package.py index 35d739c7..fb89f61c 100644 --- a/tests/test_ods_package.py +++ b/tests/test_ods_package.py @@ -109,6 +109,24 @@ def test_load_oed_from_config(self): exposure2 = OedExposure(**config) self.assertTrue(exposure.location.dataframe.equals(exposure2.location.dataframe)) + def test_categorical_with_default(self): + # UseReinsDates is a string column with a non null default, check default setting works + with tempfile.TemporaryDirectory() as tmp_run_dir: + config = { + 'ri_info': base_url + '/SourceReinsInfoOEDPiWind.csv', + 'use_field': True + } + exposure = OedExposure(**config) + exposure.ri_info.dataframe['UseReinsDates'] = None + exposure.ri_info.dataframe.to_csv(os.path.join(tmp_run_dir, 'ri_info.csv'), index=False) + + exposure = OedExposure(**{ + 'ri_info': os.path.join(tmp_run_dir, 'ri_info.csv'), + 'use_field': True + }) + ri_scope = exposure.ri_info.dataframe + self.assertTrue(isinstance(ri_scope, pd.DataFrame)) + def test_load_oed_from_df(self): location_df = pd.DataFrame({ 'PortNumber': [1, 1], @@ -387,7 +405,11 @@ def test_field_required_allow_blank_are_set_to_default(self): 'use_field': True}) original_exposure.location.dataframe.drop(columns=['ContentsTIV'], inplace=True) - original_exposure.location.dataframe['BITIV'] = np.nan + original_exposure.location.dataframe['BITIV'] = pd.NA + original_exposure.location.dataframe.loc[[1], 'BITIV'] = np.nan + original_exposure.location.dataframe.loc[[2], 'BITIV'] = None + original_exposure.location.dataframe.loc[[3], 'BITIV'] = pd.NaT + original_exposure.location.dataframe.loc[[4], 'BITIV'] = '' original_exposure.ri_info.dataframe.drop(columns='RiskLevel', inplace=True) with tempfile.TemporaryDirectory() as tmp_dir: