Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/3-e2e-test add basic e2e test #23

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ data/
__pycache__
config/*
!config/default.yaml
!config/test.yaml
17 changes: 17 additions & 0 deletions config/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Year/month window of input files to convert.
# end_year is inclusive, but end_month is exclusive: the conversion script
# builds the month list with range(start_month, end_month), so 12..13 selects
# December only.
start_year: 2016
end_year: 2016
start_month: 12
end_month: 13
input_dir: "./test_data/"
prefix: "CEH-GEAR-1hr-v2_"
suffix: "-1-hr.nc"
target_root: "./test_data/"
store_name: "output.zarr"
target_chunks:
time: 8766
y: 100
x: 100
bnds: 2
num_workers: 1
prune: 12 # no. of files to process, set to 0 to use all

72 changes: 38 additions & 34 deletions scripts/GEAR/convert_GEAR_beam.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,6 @@

from GEAR_config import load_yaml_config

if len(sys.argv) != 2:
print("Usage: python scripts/convert_GEAR_beam.py <path_to_yaml_file>")
sys.exit(1)

file_path = sys.argv[1]
config = load_yaml_config(file_path)

if not os.path.exists(config.target_root):
os.makedirs(config.target_root)

def make_path(time):
filename = config.prefix + time + config.suffix
print(f"FILENAME: {filename}")
return os.path.join(config.input_dir, filename)

years = list(range(config.start_year, config.end_year + 1))
months = list(range(config.start_month, config.end_month))
ymonths = [f"{year}{month:02d}" for year in years for month in months]
time_concat_dim = ConcatDim("time", ymonths)

pattern = FilePattern(make_path, time_concat_dim)
if config.prune > 0:
pattern = pattern.prune(nkeep=config.prune)

# Add in our own custom Beam PTransform (Parallel Transform) to apply
# some preprocessing to the dataset. In this case to convert the
# 'bounds' variables to coordinate rather than data variables.
Expand Down Expand Up @@ -82,7 +58,27 @@ def _datavar_to_coordvar(item: Indexed[T]) -> Indexed[T]:
def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
return pcoll | beam.Map(self._datavar_to_coordvar)

recipe = (
def main(config_file_path):
config = load_yaml_config(config_file_path)

if not os.path.exists(config.target_root):
os.makedirs(config.target_root)

def make_path(time):
filename = config.prefix + time + config.suffix
print(f"FILENAME: {filename}")
return os.path.join(config.input_dir, filename)

years = list(range(config.start_year, config.end_year + 1))
months = list(range(config.start_month, config.end_month))
ymonths = [f"{year}{month:02d}" for year in years for month in months]
time_concat_dim = ConcatDim("time", ymonths)

pattern = FilePattern(make_path, time_concat_dim)
if config.prune > 0:
pattern = pattern.prune(nkeep=config.prune)

recipe = (
beam.Create(pattern.items())
| OpenWithXarray(file_type=pattern.file_type)
| DataVarToCoordVar()
Expand All @@ -96,12 +92,20 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
| ConsolidateMetadata()
)

if config.num_workers > 1:
beam_options = PipelineOptions(
direct_num_workers=config.num_workers, direct_running_mode="multi_processing"
)
with beam.Pipeline(options=beam_options) as p:
p | recipe
else:
with beam.Pipeline() as p:
p | recipe
if config.num_workers > 1:
beam_options = PipelineOptions(
direct_num_workers=config.num_workers, direct_running_mode="multi_processing"
)
with beam.Pipeline(options=beam_options) as p:
p | recipe
else:
with beam.Pipeline() as p:
p | recipe

if __name__ == "__main__":
    # The script takes exactly one command-line argument: the YAML config path.
    if len(sys.argv) == 2:
        config_file_path = sys.argv[1]
        main(config_file_path)
    else:
        print("Usage: python scripts/convert_GEAR_beam.py <path_to_yaml_file>")
        sys.exit(1)
46 changes: 46 additions & 0 deletions scripts/GEAR/test_convert_GEAR_beam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import convert_GEAR_beam
import unittest
import shutil
import zarr
import numpy as np

# YAML configuration handed to convert_GEAR_beam.main() by the test.
config_file_path = "config/test.yaml"
# Zarr store opened as the conversion result; deleted before and after the test.
test_data_output = "test_data/output.zarr"
# Reference zarr store that the conversion result is compared against.
expected_data_output = "test_data/expected.zarr"

class TestConvertsToZarr(unittest.TestCase):
    """End-to-end test: run the GEAR conversion and compare it to a reference store."""

    @staticmethod
    def _remove_output():
        """Delete the generated zarr store; a missing store is not an error."""
        try:
            shutil.rmtree(test_data_output)
        except FileNotFoundError:
            pass

    def setUp(self):
        # Remove any store left over from a previous run.
        self._remove_output()

    def tearDown(self):
        # Remove the store produced by the test.
        self._remove_output()

    def test_convert_to_zarr(self):
        """Convert the test data and check the coordinate arrays match the reference."""
        # Convert test data to zarr.
        convert_GEAR_beam.main(config_file_path)

        # Open the freshly written store and the reference store read-only.
        result = zarr.open(test_data_output, mode="r")
        expected = zarr.open(expected_data_output, mode="r")

        # Compare coordinate *values*. Indexing with [:] loads each zarr array
        # into a NumPy array; comparing the zarr array objects directly with
        # `==` does not perform an element-wise comparison.
        for name in ("time", "y", "x"):
            with self.subTest(coordinate=name):
                np.testing.assert_array_equal(result[name][:], expected[name][:])

# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
Binary file added test_data/CEH-GEAR-1hr-v2_201612-1-hr.nc
Binary file not shown.
36 changes: 36 additions & 0 deletions test_data/expected.zarr/.zattrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"Conventions": "CF-1.6",
"acknowledgement": "This research forms part of the SINATRA project which is supported by the United Kingdom NERC Flooding from Intense Rainfall programme (grant NE/K00896X/1) and an associated knowledge exchange award. It was also funded as part of the CONVEX project which was supported by the United Kingdom NERC Changing Water Cycle programme (grant NE/I006680/1) and the INTENSE project through the European Research Council (grant ERC-2013-CoG-617329). The update of CEH-GEAR-1hr (version 2) was funded by NERC Hydro?JULES programme (grant NE/S017380/1)",
"cdm_data_type": "Grid",
"contributor_name": "Lewis, E., Quinn, N., Blenkinsop, S., Fowler, H.J., Freer, J., Tanguy, M., Hitt, O., Coxon, G., Bates, P., Woods, R., Fry, M., Chevuturi, A., Swain, O., White, S.M.",
"coordinates": "crs time_bnds x_bnds y_bnds",
"creator_email": "[email protected]",
"creator_institution": "UK Centre for Ecology & Hydrology (UKCEH)",
"creator_name": "Tanguy, M.",
"date_created": "2021-11-24",
"geospatial_lat_max": 61.026796,
"geospatial_lat_min": 49.766807,
"geospatial_lon_max": 3.554013,
"geospatial_lon_min": -7.55716,
"history": "File created on 2021-11-24",
"id": "https://doi.org/10.5285/fc9423d6-3d54-467f-bb2b-fc7357a3941f",
"institution": "Newcastle University",
"keywords": "rainfall, precipitation, nearest neighbour interpolation, flood, storm, sub-daily, hourly",
"licence": "This dataset is available under the terms of the Open Government Licence https://eidc.ceh.ac.uk/licences/OGL/plain",
"metadata_link": "",
"naming_authority": "DataCITE",
"publisher_institution": "NERC Environmental Information Data Centre",
"references": "Elizabeth Lewis, Niall Quinn, Stephen Blenkinsop, Hayley J. Fowler, Jim Freer, Maliko Tanguy, Olivia Hitt, Gemma Coxon, Paul Bates, Ross Woods. 2018. A rule based quality control method for hourly rainfall data and a 1km resolution gridded hourly rainfall dataset for Great Britain: CEH-GEAR1hr. Journal of Hydrology, 564, 930-943, https://doi.org/10.1016/j.jhydrol.2018.07.034.",
"source": "This dataset has been generated from a number of input datasets. Sub-daily raingauge data from the Met Office, the Scottish Environment Protection Agency (SEPA), the Environment Agency (EA) and Natural Resources Wales (NRW) were used to disaggregate the daily CEH-GEAR rainfall dataset (https://doi.org/10.5285/ee9ab43d-a4fe-4e73-afd5-cd4fc4c82556) into hourly timesteps.",
"spatial_resolution_distance": 1000.0,
"spatial_resolution_unit": "urn:ogc:def:uom:EPSG::9001",
"standard_name_url_vocabulary": "NERC Vocabulary Server, https://vocab.nerc.ac.uk/standard_name/",
"standard_name_vocabulary": "CF Standard Name Table v70, http://cfconventions.org/standard-names.html",
"summary": "The CEH-GEAR1hr-v2 dataset contains 1-km grids of hourly rainfall estimates for GB for the period 1990-2017. The gridded rainfall estimates are derived by applying the nearest neighbour interpolation method to hourly raingauge observations. These interpolated hourly estimates were then used to temporally disaggregate the daily CEH-GEAR dataset (https://doi.org/10.5285/33604ea0-c238-4488-813d-0ad9ab7c51ca). The dataset also contains, for each day, a grid containing, for each grid point, the distance between the grid point and the closest recording raingauge used in its interpolation. When this distance is greater than 50 km, or there is 0mm rainfall recorded in the closest gauge the daily value is disaggregated using a design storm. The dataset therefore also contains for each day, a grid containing for each grid point a flag showing if the design storm was used. These grids are provided as an indicator of the quality of the estimates.",
"time_coverage_duration": "P27Y",
"time_coverage_end": "2016-12-31 23:00:00 UTC",
"time_coverage_resolution": "P1H",
"time_coverage_start": "2016-12-01 00:00:00 UTC",
"title": "Gridded estimates of hourly areal rainfall for Great Britain (1990-2016) - version 2 [CEH-GEAR1hr.v2]",
"version": "v2"
}
3 changes: 3 additions & 0 deletions test_data/expected.zarr/.zgroup
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"zarr_format": 2
}
Loading