Merge pull request #116 from OasisLMF/release/3.1.5

Release 3.1.5 (30th May 2024)
OasisLMF · May 30, 2024 · 9b72eb5 · 9b72eb5
2 parents 3c9a3a2 + 72fa3ee
commit 9b72eb5
Show file tree

Hide file tree

Showing 8 changed files with 87 additions and 25 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,14 @@
 ODS_Tools Changelog
 ===================
 
+`3.1.5`_
+ ---------
+* [#80](https://github.com/OasisLMF/ODS_Tools/pull/80) - Added fields to analysis settings schema
+* [#115](https://github.com/OasisLMF/ODS_Tools/pull/115) - Fix issue when a categorical column has a default value
+* [#117](https://github.com/OasisLMF/ODS_Tools/pull/117) - Fix/fill empty
+* [#78](https://github.com/OasisLMF/ODS_Tools/pull/78) - Release 3.1.4
+.. _`3.1.5`:  https://github.com/OasisLMF/ODS_Tools/compare/3.1.4...3.1.5
+
 .. _`3.1.4`:  https://github.com/OasisLMF/ODS_Tools/compare/3.1.3...3.1.4
 
 `3.1.3`_

diff --git a/ods_tools/__init__.py b/ods_tools/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '3.1.4'
+__version__ = '3.1.5'
 
 import logging
 

diff --git a/ods_tools/oed/__init__.py b/ods_tools/oed/__init__.py
@@ -5,13 +5,13 @@
 from .common import (
     OdsException, PANDAS_COMPRESSION_MAP, PANDAS_DEFAULT_NULL_VALUES, USUAL_FILE_NAME, OED_TYPE_TO_NAME,
     OED_NAME_TO_TYPE, OED_IDENTIFIER_FIELDS, VALIDATOR_ON_ERROR_ACTION, DEFAULT_VALIDATION_CONFIG, OED_PERIL_COLUMNS, fill_empty,
-    UnknownColumnSaveOption, BLANK_VALUES
+    UnknownColumnSaveOption, BLANK_VALUES, is_empty
 )
 
 
 __all__ = [
     'OedExposure', 'OedSchema', 'OedSource', 'ModelSettingSchema', 'AnalysisSettingSchema',
     'OdsException', 'PANDAS_COMPRESSION_MAP', 'PANDAS_DEFAULT_NULL_VALUES', 'USUAL_FILE_NAME', 'OED_TYPE_TO_NAME',
     'OED_NAME_TO_TYPE', 'OED_IDENTIFIER_FIELDS', 'VALIDATOR_ON_ERROR_ACTION', 'DEFAULT_VALIDATION_CONFIG', 'OED_PERIL_COLUMNS', 'fill_empty',
-    'UnknownColumnSaveOption', 'BLANK_VALUES'
+    'UnknownColumnSaveOption', 'BLANK_VALUES', 'is_empty'
 ]
diff --git a/ods_tools/oed/common.py b/ods_tools/oed/common.py
@@ -143,14 +143,27 @@ def __get__(self, obj, type=None):
 
 BLANK_VALUES = {np.nan, '', None, pd.NA, pd.NaT}
 
+dtype_to_python = {
+    'Int8': int,
+    'Int32': int,
+    'Int64': int,
+    'bytes': lambda x: bytes(x, 'utf-8'),
+    'float64': float,
+    'category': str
+}
+
+
+def is_empty(df, columns):
+    return (df[columns].isnull()) | (df[columns] == '')
+
 
 def fill_empty(df, columns, value):
     if isinstance(columns, str):
         columns = [columns]
     for column in columns:
         if df[column].dtypes.name == 'category' and value not in {None, np.nan}.union(df[column].cat.categories):
             df[column] = df[column].cat.add_categories(value)
-        df.loc[df[column].isin(BLANK_VALUES), column] = value
+        df.loc[is_empty(df, column), column] = value
 
 
 class UnknownColumnSaveOption(Enum):

diff --git a/ods_tools/oed/oed_schema.py b/ods_tools/oed/oed_schema.py
@@ -4,7 +4,7 @@
 import numba as nb
 import numpy as np
 
-from .common import OdsException, BLANK_VALUES, cached_property
+from .common import OdsException, BLANK_VALUES, cached_property, dtype_to_python
 
 ENV_ODS_SCHEMA_PATH = os.getenv('ODS_SCHEMA_PATH')
 
@@ -121,6 +121,25 @@ def peril_filtering(self, peril_ids, peril_filters):
         """
         return jit_peril_filtering(peril_ids.to_numpy().astype('str'), peril_filters.to_numpy().astype('str'), self.nb_perils_dict)
 
+    @staticmethod
+    def get_default_from_ods_fields(ods_fields, field_name):
+        field_info = ods_fields.get(field_name.lower())
+        if field_info is None:
+            return ''
+        if field_info['pd_dtype'] == 'category':
+            if field_info['Default'] != 'n/a':
+                return field_info['Default']
+            else:
+                return ''
+        else:
+            if field_info['Default'] != 'n/a':
+                return dtype_to_python[field_info['pd_dtype']](field_info['Default'])
+            else:
+                return np.nan
+
+    def get_default(self, field_name, oed_type='null'):
+        return self.get_default_from_ods_fields(self.schema['input_fields'][oed_type], field_name)
+
     @staticmethod
     def to_universal_field_name(column: str):
         """

diff --git a/ods_tools/oed/source.py b/ods_tools/oed/source.py
@@ -6,8 +6,8 @@
 import numpy as np
 from chardet.universaldetector import UniversalDetector
 
-from .common import (OED_TYPE_TO_NAME, OdsException, PANDAS_COMPRESSION_MAP, PANDAS_DEFAULT_NULL_VALUES, is_relative, BLANK_VALUES, fill_empty,
-                     UnknownColumnSaveOption, cached_property)
+from .common import (OED_TYPE_TO_NAME, OdsException, PANDAS_COMPRESSION_MAP, PANDAS_DEFAULT_NULL_VALUES, is_relative, fill_empty,
+                     UnknownColumnSaveOption, cached_property, is_empty)
 from .forex import convert_currency
 from .oed_schema import OedSchema
 
@@ -267,7 +267,7 @@ def as_oed_type(cls, oed_df, column_to_field):
                 if oed_df[column].dtype.name == 'category' and '' not in oed_df[column].dtype.categories:
                     oed_df[column] = oed_df[column].cat.add_categories('')
                 oed_df[column] = oed_df[column]  # make a copy f the col in case it is read_only
-                oed_df.loc[oed_df[column].isin(BLANK_VALUES), column] = ''
+                oed_df.loc[is_empty(oed_df, column), column] = ''
             elif pd_dtype[column].startswith('Int'):
                 to_tmp_dtype[column] = 'float'
 
@@ -287,12 +287,7 @@ def prepare_df(cls, df, column_to_field, ods_fields):
         """
         # set default values
         for col, field_info in column_to_field.items():
-            if (field_info
-                    and field_info['Default'] != 'n/a'
-                    and (df[col].isna().any() or (field_info['pd_dtype'] == 'category' and df[col].isnull().any()))):
-                fill_empty(df, col, df[col].dtype.type(field_info['Default']))
-            elif df[col].dtype.name == 'category':
-                fill_empty(df, col, '')
+            fill_empty(df, col, OedSchema.get_default_from_ods_fields(ods_fields, col))
 
         # add required columns that allow blank values if missing
         present_field = set(field_info['Input Field Name'] for field_info in column_to_field.values())

diff --git a/ods_tools/oed/validator.py b/ods_tools/oed/validator.py
@@ -1,12 +1,13 @@
 import functools
 import json
+import numpy as np
 import logging
 
 from pathlib import Path
 from collections.abc import Iterable
 
 from .common import (OdsException, OED_PERIL_COLUMNS, OED_IDENTIFIER_FIELDS, DEFAULT_VALIDATION_CONFIG,
-                     VALIDATOR_ON_ERROR_ACTION, BLANK_VALUES)
+                     VALIDATOR_ON_ERROR_ACTION, BLANK_VALUES, is_empty)
 from .oed_schema import OedSchema
 
 logger = logging.getLogger(__name__)
@@ -134,7 +135,7 @@ def check_required_fields(self):
                     columns = [columns]
                 for column in columns:
                     if field_info.get("Allow blanks?").upper() == 'NO':
-                        missing_value_df = oed_source.dataframe[oed_source.dataframe[column].isin(BLANK_VALUES)]
+                        missing_value_df = oed_source.dataframe[is_empty(oed_source.dataframe, column)]
                         if not missing_value_df.empty:
                             invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source,
                                                  'msg': f"column '{column}' has missing values in \n"
@@ -222,8 +223,9 @@ def check_occupancy_code(self):
             if occupancy_code_column is None:
                 continue
             identifier_field = self.identifier_field_maps[oed_source]
-            invalid_occupancy_code = oed_source.dataframe[~oed_source.dataframe[occupancy_code_column].astype(str).isin(
-                set(self.exposure.oed_schema.schema['occupancy']) | BLANK_VALUES)]
+            invalid_occupancy_code = oed_source.dataframe[~(np.isin(oed_source.dataframe[occupancy_code_column].astype(str),
+                                                                    list(self.exposure.oed_schema.schema['occupancy']))
+                                                            | is_empty(oed_source.dataframe, occupancy_code_column))]
             if not invalid_occupancy_code.empty:
                 invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source,
                                      'msg': f"invalid OccupancyCode.\n"
@@ -242,8 +244,9 @@ def check_construction_code(self):
             if construction_code_column is None:
                 continue
             identifier_field = self.identifier_field_maps[oed_source]
-            invalid_construction_code = oed_source.dataframe[~oed_source.dataframe[construction_code_column].astype(str).isin(
-                set(self.exposure.oed_schema.schema['construction']) | BLANK_VALUES)]
+            invalid_construction_code = oed_source.dataframe[~(np.isin(oed_source.dataframe[construction_code_column].astype(str),
+                                                                       list(self.exposure.oed_schema.schema['construction']))
+                                                               | is_empty(oed_source.dataframe, construction_code_column))]
             if not invalid_construction_code.empty:
                 invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source,
                                      'msg': f"invalid ConstructionCode.\n"
@@ -265,8 +268,9 @@ def check_country_and_area_code(self):
             identifier_field = self.identifier_field_maps[oed_source]
             area_code_column = self.field_to_column_maps[oed_source].get('AreaCode')
             if area_code_column is not None:
-                country_only_df = oed_source.dataframe[oed_source.dataframe[area_code_column].isin(BLANK_VALUES)]
-                country_area_df = oed_source.dataframe[~oed_source.dataframe[area_code_column].isin(BLANK_VALUES)]
+                country_only_df = oed_source.dataframe[is_empty(oed_source.dataframe, area_code_column)]
+                country_area_df = oed_source.dataframe[~is_empty(oed_source.dataframe, area_code_column)]
+
                 invalid_country_area = (country_area_df[
                     ~(country_area_df[[country_code_column, area_code_column]]
                       .apply(tuple, axis=1)
@@ -279,8 +283,9 @@ def check_country_and_area_code(self):
                                                 f"{invalid_country_area[identifier_field + [country_code_column, area_code_column]]}"})
             else:
                 country_only_df = oed_source.dataframe
-            invalid_country = (country_only_df[~country_only_df[country_code_column]
-                                               .isin(set(self.exposure.oed_schema.schema['country']) | BLANK_VALUES)])
+            invalid_country = (country_only_df[~(np.isin(country_only_df[country_code_column],
+                                                         list(self.exposure.oed_schema.schema['country']))
+                                                 | is_empty(country_only_df, country_code_column))])
             if not invalid_country.empty:
                 invalid_data.append({'name': oed_source.oed_name, 'source': oed_source.current_source,
                                      'msg': f"invalid CountryCode.\n"

diff --git a/tests/test_ods_package.py b/tests/test_ods_package.py
@@ -109,6 +109,24 @@ def test_load_oed_from_config(self):
                 exposure2 = OedExposure(**config)
             self.assertTrue(exposure.location.dataframe.equals(exposure2.location.dataframe))
 
+    def test_categorical_with_default(self):
+        # UseReinsDates is a string column with a non-null default; check that applying the default works
+        with tempfile.TemporaryDirectory() as tmp_run_dir:
+            config = {
+                'ri_info': base_url + '/SourceReinsInfoOEDPiWind.csv',
+                'use_field': True
+            }
+            exposure = OedExposure(**config)
+            exposure.ri_info.dataframe['UseReinsDates'] = None
+            exposure.ri_info.dataframe.to_csv(os.path.join(tmp_run_dir, 'ri_info.csv'), index=False)
+
+            exposure = OedExposure(**{
+                'ri_info': os.path.join(tmp_run_dir, 'ri_info.csv'),
+                'use_field': True
+            })
+            ri_scope = exposure.ri_info.dataframe
+            self.assertTrue(isinstance(ri_scope, pd.DataFrame))
+
     def test_load_oed_from_df(self):
         location_df = pd.DataFrame({
             'PortNumber': [1, 1],
@@ -387,7 +405,11 @@ def test_field_required_allow_blank_are_set_to_default(self):
             'use_field': True})
 
         original_exposure.location.dataframe.drop(columns=['ContentsTIV'], inplace=True)
-        original_exposure.location.dataframe['BITIV'] = np.nan
+        original_exposure.location.dataframe['BITIV'] = pd.NA
+        original_exposure.location.dataframe.loc[[1], 'BITIV'] = np.nan
+        original_exposure.location.dataframe.loc[[2], 'BITIV'] = None
+        original_exposure.location.dataframe.loc[[3], 'BITIV'] = pd.NaT
+        original_exposure.location.dataframe.loc[[4], 'BITIV'] = ''
         original_exposure.ri_info.dataframe.drop(columns='RiskLevel', inplace=True)
 
         with tempfile.TemporaryDirectory() as tmp_dir: