Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract cubewrite fill value modifications #114

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
67 changes: 67 additions & 0 deletions test/test_um2netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import iris.coords
import iris.exceptions

from netCDF4 import default_fillvals

D_LAT_N96 = 1.25 # Degrees between latitude points on N96 grid
D_LON_N96 = 1.875 # Degrees between longitude points on N96 grid

Expand Down Expand Up @@ -1207,3 +1209,68 @@ def test_convert_32_bit_with_float64(ua_plev_cube):
ua_plev_cube.data = array
um2nc.convert_32_bit(ua_plev_cube)
assert ua_plev_cube.data.dtype == np.float32


@pytest.mark.parametrize(
    "cube_data, expected_fill_val",
    [
        (np.array([1.1, 2.1], dtype="float32"),
         np.float32(um2nc.DEFAULT_FILL_VAL_FLOAT)),
        (np.array([1.1, 2.1], dtype="float64"),
         np.float64(um2nc.DEFAULT_FILL_VAL_FLOAT)),
        (np.array([1.1, 2.1], dtype="complex64"),
         np.complex64(default_fillvals["c8"])),
        (np.array([1, 2], dtype="int32"),
         np.int32(default_fillvals["i4"])),
        (np.array([1, 2], dtype="int64"),
         np.int64(default_fillvals["i8"]))
    ]
)
def test_fix_fill_value_defaults(cube_data, expected_fill_val):
    """
    Check that correct default fill values are found based
    on a cube's data's type.
    """
    fake_cube = DummyCube(12345, "fake_var", attributes={})
    fake_cube.data = cube_data

    fill_value = um2nc.fix_fill_value(fake_cube)

    assert fill_value == expected_fill_val
    # Compare dtypes rather than `type(...) == ...` (flake8 E721): the
    # returned fill value is a numpy scalar, so its dtype must match the
    # dtype of the cube's data.
    assert fill_value.dtype == cube_data.dtype

    # Check that missing value attribute set to expected fill_value
    assert fake_cube.attributes["missing_value"][0] == expected_fill_val


def test_fix_fill_value_custom():
    """
    Check that a user-supplied fill value takes precedence over the
    type-based defaults.
    """
    cube = DummyCube(12345, "fake_var", attributes={})
    cube.data = np.array([1.1, 2.2], dtype="float64")
    override = np.float64(123.456)

    result = um2nc.fix_fill_value(cube, custom_fill_value=override)

    # The custom value must be returned ...
    assert result == override
    # ... and also written into the cube's missing_value attribute.
    assert cube.attributes["missing_value"][0] == override


def test_fix_fill_value_wrong_type():
    """
    Check that an error is raised when supplied fill value's
    type does not match the cube's data's type.
    """
    cube = DummyCube(12345, "fake_var", attributes={})
    cube.data = np.array([1, 2, 3], dtype="int32")
    mismatched = np.float32(151.11)

    # Sanity check: the chosen fill value deliberately mismatches the data.
    assert cube.data.dtype != mismatched.dtype

    with pytest.raises(TypeError):
        um2nc.fix_fill_value(cube, custom_fill_value=mismatched)
72 changes: 63 additions & 9 deletions umpost/um2netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@
LEVEL_HEIGHT = "level_height"
SIGMA = "sigma"

DEFAULT_FILL_VAL_FLOAT = 1.e20


class PostProcessingError(Exception):
"""Generic class for um2nc specific errors."""
Expand Down Expand Up @@ -319,6 +321,66 @@ def fix_latlon_coords(cube, grid_type, dlat, dlon):
fix_lon_coord_name(longitude_coordinate, grid_type, dlon)


def get_default_fill_value(cube):
    """
    Get the default fill value for a cube's data's type.

    Floating point data uses the post-processing default
    (DEFAULT_FILL_VAL_FLOAT); all other kinds use the netCDF4 library
    defaults, keyed by dtype kind and item size (e.g. "i4", "c8").

    Parameters
    ----------
    cube: Iris cube object.

    Returns
    -------
    fill_value: numpy scalar with type matching cube.data
    """
    if cube.data.dtype.kind == 'f':
        fill_value = DEFAULT_FILL_VAL_FLOAT
    else:
        # netCDF4 keys its default fill values by "<kind><itemsize>".
        key = f"{cube.data.dtype.kind:s}{cube.data.dtype.itemsize:1d}"
        fill_value = default_fillvals[key]

    # NB: the `_FillValue` attribute appears to be converted to match the
    # cube data's type externally (likely in the netCDF4 library). It's not
    # strictly necessary to do the conversion here. However, we need
    # the separate `missing_value` attribute to have the correct type
    # and so it's cleaner to set the type for both here.

    # `dtype.type` is the dtype's numpy scalar constructor: it converts the
    # value directly, without the temporary one-element array round-trip.
    return cube.data.dtype.type(fill_value)


def fix_fill_value(cube, custom_fill_value=None):
    """
    Set a cube's missing_value attribute.

    Parameters
    ----------
    cube: Iris cube object (modified in place).
    custom_fill_value: Fill value to use in place of defaults.
        Type must match cube data's type.

    Returns
    -------
    fill_value: the value written into the cube's `missing_value`
        attribute (a numpy scalar matching cube.data's dtype).

    Raises
    ------
    TypeError: if the fill value's type does not match the cube
        data's type.
    """
    if custom_fill_value is not None:
        fill_value = custom_fill_value
    else:
        fill_value = get_default_fill_value(cube)

    # Compare dtypes via np.dtype() rather than `type(...) == ...`
    # (flake8 E721). np.dtype() maps both numpy scalar types and plain
    # Python types (e.g. float -> float64) onto dtypes, matching the
    # behaviour of the original type-equality check.
    try:
        fill_dtype = np.dtype(type(fill_value))
    except TypeError:
        # Type cannot be mapped to a dtype: treat as a mismatch below.
        fill_dtype = None

    if fill_dtype != cube.data.dtype:
        msg = (f"fill_val type {type(fill_value)} does not "
               f"match cube {cube.name()} data type {cube.data.dtype}.")
        raise TypeError(msg)

    # Use an array to force the stored attribute's type to match the
    # data type (also keeps the separate `missing_value` attribute
    # consistent with `_FillValue`).
    cube.attributes['missing_value'] = np.array([fill_value],
                                                cube.data.dtype)
    return fill_value


# TODO: split cube ops into functions, this will likely increase process() workflow steps
def cubewrite(cube, sman, compression, use64bit, verbose):
# TODO: move into process() AND if a new cube is returned, swap into filtered cube list
Expand All @@ -328,15 +390,7 @@ def cubewrite(cube, sman, compression, use64bit, verbose):
if not use64bit:
convert_32_bit(cube)

# Set the missing_value attribute. Use an array to force the type to match
# the data type
if cube.data.dtype.kind == 'f':
fill_value = 1.e20
else:
# Use netCDF defaults
fill_value = default_fillvals['%s%1d' % (cube.data.dtype.kind, cube.data.dtype.itemsize)]

cube.attributes['missing_value'] = np.array([fill_value], cube.data.dtype)
fill_value = fix_fill_value(cube)

# If reference date is before 1600 use proleptic gregorian
# calendar and change units from hours to days
Expand Down
Loading