Update 17_slp6_stacs.py
Blind copy of AR5 script, to be adjusted
EtienneKras committed Jul 26, 2023
1 parent cd7fc61 commit 4a03f08
Showing 1 changed file with 118 additions and 129 deletions.
247 changes: 118 additions & 129 deletions scripts/17_slp6_stacs.py
@@ -3,6 +3,7 @@
import pathlib
import sys
from re import S, template
import json

# make modules importable when running this file as script
sys.path.append(str(pathlib.Path(__file__).parent.parent))
@@ -14,6 +15,7 @@
import pandas as pd
import pystac
import rasterio
import rioxarray as rio
import shapely
import xarray as xr
from datacube.utils.cog import write_cog
@@ -33,6 +35,7 @@
gen_zarr_asset,
get_template_collection,
)
from stac.coclico_extension import CoclicoExtension # self built stac extension


def cftime_to_pdts(t: cftime._cftime) -> pd.Timestamp:
@@ -64,75 +67,7 @@ def name_tif(da: xr.DataArray, prefix: str = "", scenario: str = "") -> str:
return str(blob_name)


def make_cf_compliant(ds: xr.Dataset) -> xr.Dataset:
# TODO: check with etienne if nv is also used for time bounds
ds = ds.rename_dims({"ens": "nensemble", "bnds": "nv"}) # nv = number of vertices

ds = ds.rename_vars({"modelname": "ensemble"})

# # set some data variables to coordinates to avoid duplication of dimensions in later stage
ds = ds.set_coords(["ensemble", "time_bnds"])

# TODO: check with Etienne why this is required
ds.time_bnds.encoding[
"_FillValue"
] = None # xarray sets _FillValue automatically to None for float types, prevent this when needed

# TODO: check with Etienne if equal dimensions for ensembles are also necessary for cog's - I don't think so.
# code can be found in notebook file.

# remove extra spaces in model names and use the updated values as coordinate axis for ensemble
model_names = np.array(
[s.strip() for s in ds["ensemble"].astype(str).values], dtype="S"
)
ds = ds.assign_coords(ensemble=("nensemble", model_names))

# this long name is copied from netcdf description
ds["ensemble"].attrs[
"long_name"
] = "Model names in the same order as in totslr_ens var"

# add some extra attributes (found at https://www.cen.uni-hamburg.de/en/icdc/data/ocean/ar5-slr.html#zugang)
ds.attrs["title"] = "Sea level rise from AR5"
ds.attrs["institution"] = "Institute of Oceanography / CEN / University of Hamburg"
ds.attrs["comment"] = (
"Here are the files which contain the ocean and ice components, sums and"
" uncertainties as used in preparing the IPCC AR5 report (2014), with some"
" slight modifications. One small choice I made here was to combine the ocean"
" and inverse barometer effect into one field, both for the mean and the"
" uncertainty. I also have provided better smoothed maps for the *time series*"
" (the 20-mean-difference maps are the same as in the report). This actually"
" shouldn't be the cause for any difference in the report figures, as I didn't"
" use this field for anything but the coastal stations in Fig. 13.23, and they"
" have the same interpolation scheme at the coast now, just a better"
" interpolation scheme in the open ocean (bilinear; not shown in any figure in"
" the report). "
"\n"
"One thing to note: I made a choice as to how to provide the 5%"
" and 95% (upper and lower 90 %% confidence interval) uncertainty estimates for"
" the various fields. I have provided the maps of these similar to the way"
" Jonathan Gregory provided the time series to me, as the individual component"
" upper and lower bounds. However, to combine these errors in the same way as"
" in the report, you will need to take the difference between the upper bound"
" and the middle value (for combining the upper uncertainty total estimate) or"
" the lower bound and middle value (for combining the lower uncertainty total"
" estimate), and use the formula shown in the Supplementary Material for"
" Chapter 13 of the AR5 (SM13) - minus the IBE which is combined with the ocean"
" field here."
)
ds.attrs["references"] = (
"Chapter 13 paper: Church, J. A., P. Clark, A. Cazenave, J. Gregory, S."
" Jevrejeva, A. Levermann, M. Merrifield, G. Milne, R.S.Nerem, P. Nunn, A."
" Payne, W. Pfeffer, D. Stammer, and A. Unnikrishnan (2013), Sea level change,"
" in Climate Change 2013: The Physical Science Basis, edited by T. F. Stocker,"
" D. Qin, G.-K. Plattner, M. Tignor, S. Allen, J. Boschung, A. Nauels, Y. Xia,"
" V. Bex, and P. Midgley, Cambridge University Press, Cambridge, UK and New"
" York, NY. USA."
)

return ds


# TODO: move itemize to ETL or stac.blueprint when generalized
def itemize(
da,
item: pystac.Item,
@@ -152,8 +87,10 @@ def itemize(
item.id = blob_name
item.geometry = geometry
item.bbox = bbox
item.datetime = cftime_to_pdts(da["time"].item()).to_pydatetime()
# item.datetime = pd.Timestamp(da.coords[time_dim].item()).to_pydatetime()
item.datetime = pd.Timestamp(
da["time"].item().decode("utf-8")
).to_pydatetime() # dataset specific
# item.datetime = cftime_to_pdts(da["time"].item()).to_pydatetime() # dataset specific
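# Minimal sketch of what the decode path above assumes: the main loop further down
# stores the "time" coordinate as a fixed-width byte string (dtype="S"), so the item
# datetime is recovered roughly like this (the timestamp value is illustrative only):
#   raw = np.array("2050-01-01 00:00:00", dtype="S").item()  # -> b"2050-01-01 00:00:00"
#   pd.Timestamp(raw.decode("utf-8")).to_pydatetime()
# The commented cftime_to_pdts() variant applies when "time" still holds cftime objects.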

ext = pystac.extensions.projection.ProjectionExtension.ext(
item, add_if_missing=True
@@ -168,8 +105,13 @@
roles = asset_roles or ["data"]

href = os.path.join(
GCS_PROTOCOL, BUCKET_NAME, BUCKET_PROJ, DATASET_FILENAME, blob_name
GCS_PROTOCOL,
BUCKET_NAME,
BUCKET_PROJ,
metadata["TITLE_ABBREVIATION"],
blob_name,
)

# TODO: We need to generalize this `href` somewhat.
asset = pystac.Asset(
href=href,
@@ -191,20 +133,31 @@

STAC_DIR = "current"
TEMPLATE_COLLECTION = "template" # stac template for dataset collection
COLLECTION_ID = "slp" # name of stac collection
COLLECTION_TITLE = "AR5 Sea level change projections"
DATASET_DESCRIPTION = "AR5 Sea level change projections"
COLLECTION_ID = "slp5" # name of stac collection

# hard-coded input params which differ per dataset
DATASET_FILENAME = "ar5_slp"
METADATA = "metadata_AR5_slp.json"
DATASET_DIR = "18_AR5_SLP_IPCC"
CF_FILE = "total-ens-slr_CF.nc"

# these are added at collection level, determine dashboard graph layout using all items
PLOT_SERIES = "scenarios"
PLOT_X_AXIS = "time"
PLOT_TYPE = "line"
MIN = 0
MAX = 3
LINEAR_GRADIENT = [
{"color": "hsl(110,90%,80%)", "offset": "0.000%", "opacity": 100},
{"color": "hsla(55,88%,53%,0.5)", "offset": "50.000%", "opacity": 100},
{"color": "hsl(0,90%,70%)", "offset": "100.000%", "opacity": 100},
]

# define local directories
home = pathlib.Path().home()
tmp_dir = home.joinpath("data", "tmp")

# remote p drive
coclico_data_dir = p_drive.joinpath("11205479-coclico", "FASTTRACK_DATA")
coclico_data_dir = p_drive.joinpath(
"11205479-coclico", "FASTTRACK_DATA"
) # remote p drive

# use local or remote data dir
use_local_data = False
@@ -217,9 +170,13 @@ def itemize(
if not ds_dir.exists():
raise FileNotFoundError(f"Data dir does not exist, {str(ds_dir)}")

# directory to export result (make if not exists)
cog_dir = ds_dir.joinpath("cogs")
cog_dir.mkdir(parents=True, exist_ok=True)
# directory to export result
cog_dirs = ds_dir.joinpath("cogs")

# load metadata template
metadata_fp = ds_dir.joinpath(METADATA)
with open(metadata_fp, "r") as f:
metadata = json.load(f)
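# Sketch of the fields this script reads from metadata_AR5_slp.json; the key names are
# taken from the lookups elsewhere in this file, the values shown are placeholders only:
#   {
#     "TITLE": "...",
#     "TITLE_ABBREVIATION": "...",
#     "SHORT_DESCRIPTION": "...",
#     "KEYWORDS": ["..."],
#     "UNITS": "...",
#     "LICENSE": "...",
#     "SPATIAL_EXTENT": "...",
#     "TEMPORAL_EXTENT": "...",
#     "PROVIDERS": "..."
#   }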

catalog = Catalog.from_file(os.path.join(rel_root, STAC_DIR, "catalog.json"))

@@ -231,39 +188,49 @@
collection = get_template_collection(
template_fp=template_fp,
collection_id=COLLECTION_ID,
title=COLLECTION_TITLE,
description=DATASET_DESCRIPTION,
title=metadata["TITLE"],
description=metadata["SHORT_DESCRIPTION"],
keywords=metadata["KEYWORDS"],
# license=metadata["LICENSE"],
# spatial_extent=metadata["SPATIAL_EXTENT"],
# temporal_extent=metadata["TEMPORAL_EXTENT"],
# providers=metadata["PROVIDERS"],
)

layout = LayoutCoG()

# the dataset contains three different rcp scenarios
RCPS = ["26", "45", "85"]
for rcp in RCPS:
# load dataset and do some pre-processing
ds_fp = ds_dir.joinpath(f"total-ens-slr-{rcp}-5.nc")
ds = xr.open_dataset(ds_fp)
# open the dataset
ds_fp = ds_dir.joinpath(CF_FILE)
ds = xr.open_dataset(ds_fp)

# loop over gdal / nc folder struct (max depth = 3)
# NOTE, does not work as we open the geotiff which contains too little information to itemize objects (need dataArrays)
# for scen_str in os.listdir(cog_dirs): # search in the scenarios (the highest level)

# for var_ens in os.listdir(
# cog_dirs.joinpath(scen_str)
# ): # search in the variables / ensembles (the middle level)

# format dataset based upon notebooks/18_SLR_AR5.ipynb
ds = make_cf_compliant(ds)
# TODO: write CF compliancy output file?
# for fname in os.listdir(
# cog_dirs.joinpath(scen_str, var_ens)
# ): # search in the file name (the lowest level)

# # convert cf noleap yrs to datetime
# ds["time"] = ds.indexes["time"].to_datetimeindex()
# blob_name = pathlib.Path(scen_str, var_ens, fname)
# outpath = cog_dirs.joinpath(blob_name)
# da = rio.open_rasterio(outpath, masked=True)

# add crs and spatial dims
ds.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
if not ds.rio.crs:
ds = ds.rio.write_crs("EPSG:4326")
# TODO: check what we can generalize and put into a function for temporal CoGs
# loop over CF compliant NC file, same code as in related .ipynb
for idx, scen in enumerate(ds["scenarios"].values):
rcp = scen.decode("utf-8")

# format rcp name for filenaming
rcp_name = f"rcp={rcp}"
rcp_name = "rcp=%s" % rcp.strip("RCP")
print(rcp_name)

# extract list of data variables
variables = set(ds.variables) - set(ds.dims) - set(ds.coords)

ds["ensemble"] = ds.coords["ensemble"].astype(str)

ntimes = ds.dims["time"]
for ntime in range(ntimes):
ds2 = ds.copy()
@@ -277,35 +244,57 @@

for var_name in variables:
# time bounds are extracted, so nv dim can be dropped, as tiff should be 2D or 3D.
da = ds2[var_name]

# compose tif name
fname = time_name + ".tif"
blob_name = pathlib.Path(rcp_name, var_name, fname)
outpath = cog_dir.joinpath(blob_name)

# make parent dir if not exists
outpath.parent.mkdir(parents=True, exist_ok=True)

# add time name as scalar variable to tif
da["time_bnds"] = time_name

template_item = pystac.Item(
"id", None, None, datetime.datetime(2000, 1, 1), {}
)

item = itemize(da, template_item, blob_name=str(blob_name))
collection.add_item(item, strategy=layout)

# set overwrite is false because tifs should be unique
try:
write_cog(da, fname=outpath, overwrite=False)
except OSError as e:
continue
da = ds2.sel({"nscenarios": idx})[var_name]

for idv, ens in enumerate(da["ensemble"].values):
da2 = da.isel({"ensemble": idv})

# add crs and spatial dims
da2.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
if not da2.rio.crs:
da2 = da2.rio.write_crs("EPSG:4326")

# reset some variables and attributes
da2["time"] = np.array(
cftime_to_pdts(da2["time"].item()).strftime(
"%Y-%m-%d %H:%M:%S"
),
dtype="S",
)
da2["time"].attrs["standard_name"] = "time"
da2["time_bnds"] = np.array(time_name, dtype="S")
da2["time_bnds"].attrs["long_name"] = "time boundaries"

# compose tif name
fname = time_name + ".tif"
blob_name = pathlib.Path(
rcp_name, var_name + "_ens%s" % int(ens), fname
)
outpath = cog_dirs.joinpath(blob_name)
template_item = pystac.Item(
"id", None, None, datetime.datetime(2000, 1, 1), {}
)

item = itemize(da2, template_item, blob_name=str(blob_name))
collection.add_item(item, strategy=layout)

# TODO: use gen_default_summaries() from blueprint.py after making it frontend compliant.
collection.summaries = Summaries({})

# this calls CollectionCoclicoExtension since stac_obj==pystac.Collection
coclico_ext = CoclicoExtension.ext(collection, add_if_missing=True)

# Add frontend properties defined above to collection extension properties. The
# properties attribute of this extension is linked to the extra_fields attribute of
# the stac collection.
coclico_ext.units = metadata["UNITS"]
coclico_ext.plot_series = PLOT_SERIES
coclico_ext.plot_x_axis = PLOT_X_AXIS
coclico_ext.plot_type = PLOT_TYPE
coclico_ext.min_ = MIN
coclico_ext.max_ = MAX
coclico_ext.linear_gradient = LINEAR_GRADIENT

# add collection to catalog
catalog.add_child(collection)

@@ -329,9 +318,9 @@ def itemize(
)

dir_to_google_cloud(
dir_path=str(cog_dir),
dir_path=str(cog_dirs),
gcs_project=GCS_PROJECT,
bucket_name=BUCKET_NAME,
bucket_proj=BUCKET_PROJ,
dir_name=DATASET_FILENAME,
dir_name=metadata["TITLE_ABBREVIATION"],
)
