Update 17_slp6_stacs.py
Blind copy of AR5 script, to be adjusted
EtienneKras committed Jul 26, 2023
1 parent cd7fc61 commit 4a03f08
Showing 1 changed file with 118 additions and 129 deletions.
247 changes: 118 additions & 129 deletions scripts/17_slp6_stacs.py
@@ -3,6 +3,7 @@
import pathlib
import sys
from re import S, template
import json

# make modules importable when running this file as script
sys.path.append(str(pathlib.Path(__file__).parent.parent))
@@ -14,6 +15,7 @@
import pandas as pd
import pystac
import rasterio
import rioxarray as rio
import shapely
import xarray as xr
from datacube.utils.cog import write_cog
@@ -33,6 +35,7 @@
gen_zarr_asset,
get_template_collection,
)
from stac.coclico_extension import CoclicoExtension # self built stac extension


def cftime_to_pdts(t: cftime._cftime) -> pd.Timestamp:
@@ -64,75 +67,7 @@ def name_tif(da: xr.DataArray, prefix: str = "", scenario: str = "") -> str:
return str(blob_name)


def make_cf_compliant(ds: xr.Dataset) -> xr.Dataset:
# TODO: check with etienne if nv is also used for time bounds
ds = ds.rename_dims({"ens": "nensemble", "bnds": "nv"}) # nv = number of vertices

ds = ds.rename_vars({"modelname": "ensemble"})

# # set some data variables to coordinates to avoid duplication of dimensions in later stage
ds = ds.set_coords(["ensemble", "time_bnds"])

# TODO: check with Etienne why this is required
ds.time_bnds.encoding[
"_FillValue"
] = None # xarray sets _FillValue automatically to None for float types, prevent this when needed

# TODO: check with Etienne if equal dimensions for ensembles are also necessary for cog's - I don't think so.
# code can be found in notebook file.

# remove extra spaces in model names and use the updated values as coordinate axis for ensemble
model_names = np.array(
[s.strip() for s in ds["ensemble"].astype(str).values], dtype="S"
)
ds = ds.assign_coords(ensemble=("nensemble", model_names))

# this long name is copied from netcdf description
ds["ensemble"].attrs[
"long_name"
] = "Model names in the same order as in totslr_ens var"

# add some extra attributes (found at https://www.cen.uni-hamburg.de/en/icdc/data/ocean/ar5-slr.html#zugang)
ds.attrs["title"] = "Sea level rise from AR5"
ds.attrs["institution"] = "Institute of Oceanography / CEN / University of Hamburg"
ds.attrs["comment"] = (
"Here are the files which contain the ocean and ice components, sums and"
" uncertainties as used in preparing the IPCC AR5 report (2014), with some"
" slight modifications. One small choice I made here was to combine the ocean"
" and inverse barometer effect into one field, both for the mean and the"
" uncertainty. I also have provided better smoothed maps for the *time series*"
" (the 20-mean-difference maps are the same as in the report). This actually"
" shouldn't be the cause for any difference in the report figures, as I didn't"
" use this field for anything but the coastal stations in Fig. 13.23, and they"
" have the same interpolation scheme at the coast now, just a better"
" interpolation scheme in the open ocean (bilinear; not shown in any figure in"
" the report). "
"\n"
"One thing to note: I made a choice as to how to provide the 5%"
" and 95% (upper and lower 90 %% confidence interval) uncertainty estimates for"
" the various fields. I have provided the maps of these similar to the way"
" Jonathan Gregory provided the time series to me, as the individual component"
" upper and lower bounds. However, to combine these errors in the same way as"
" in the report, you will need to take the difference between the upper bound"
" and the middle value (for combining the upper uncertainty total estimate) or"
" the lower bound and middle value (for combining the lower uncertainty total"
" estimate), and use the formula shown in the Supplementary Material for"
" Chapter 13 of the AR5 (SM13) - minus the IBE which is combined with the ocean"
" field here."
)
ds.attrs["references"] = (
"Chapter 13 paper: Church, J. A., P. Clark, A. Cazenave, J. Gregory, S."
" Jevrejeva, A. Levermann, M. Merrifield, G. Milne, R.S.Nerem, P. Nunn, A."
" Payne, W. Pfeffer, D. Stammer, and A. Unnikrishnan (2013), Sea level change,"
" in Climate Change 2013: The Physical Science Basis, edited by T. F. Stocker,"
" D. Qin, G.-K. Plattner, M. Tignor, S. Allen, J. Boschung, A. Nauels, Y. Xia,"
" V. Bex, and P. Midgley, Cambridge University Press, Cambridge, UK and New"
" York, NY. USA."
)

return ds


# TODO: move itemize to ETL or stac.blueprint when generalized
def itemize(
da,
item: pystac.Item,
@@ -152,8 +87,10 @@ def itemize(
item.id = blob_name
item.geometry = geometry
item.bbox = bbox
item.datetime = cftime_to_pdts(da["time"].item()).to_pydatetime()
# item.datetime = pd.Timestamp(da.coords[time_dim].item()).to_pydatetime()
item.datetime = pd.Timestamp(
da["time"].item().decode("utf-8")
).to_pydatetime() # dataset specific
# item.datetime = cftime_to_pdts(da["time"].item()).to_pydatetime() # dataset specific
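# Minimal sketch of what the decode path above assumes: the main loop further down
# stores the "time" coordinate as a fixed-width byte string (dtype="S"), so the item
# datetime is recovered roughly like this (the timestamp value is illustrative only):
#   raw = np.array("2050-01-01 00:00:00", dtype="S").item()  # -> b"2050-01-01 00:00:00"
#   pd.Timestamp(raw.decode("utf-8")).to_pydatetime()
# The commented cftime_to_pdts() variant applies when "time" still holds cftime objects.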

ext = pystac.extensions.projection.ProjectionExtension.ext(
item, add_if_missing=True
@@ -168,8 +105,13 @@
roles = asset_roles or ["data"]

href = os.path.join(
GCS_PROTOCOL, BUCKET_NAME, BUCKET_PROJ, DATASET_FILENAME, blob_name
GCS_PROTOCOL,
BUCKET_NAME,
BUCKET_PROJ,
metadata["TITLE_ABBREVIATION"],
blob_name,
)

# TODO: We need to generalize this `href` somewhat.
asset = pystac.Asset(
href=href,
@@ -191,20 +133,31 @@

STAC_DIR = "current"
TEMPLATE_COLLECTION = "template" # stac template for dataset collection
COLLECTION_ID = "slp" # name of stac collection
COLLECTION_TITLE = "AR5 Sea level change projections"
DATASET_DESCRIPTION = "AR5 Sea level change projections"
COLLECTION_ID = "slp5" # name of stac collection

# hard-coded input params which differ per dataset
DATASET_FILENAME = "ar5_slp"
METADATA = "metadata_AR5_slp.json"
DATASET_DIR = "18_AR5_SLP_IPCC"
CF_FILE = "total-ens-slr_CF.nc"

# these are added at collection level, determine dashboard graph layout using all items
PLOT_SERIES = "scenarios"
PLOT_X_AXIS = "time"
PLOT_TYPE = "line"
MIN = 0
MAX = 3
LINEAR_GRADIENT = [
{"color": "hsl(110,90%,80%)", "offset": "0.000%", "opacity": 100},
{"color": "hsla(55,88%,53%,0.5)", "offset": "50.000%", "opacity": 100},
{"color": "hsl(0,90%,70%)", "offset": "100.000%", "opacity": 100},
]

# define local directories
home = pathlib.Path().home()
tmp_dir = home.joinpath("data", "tmp")

# remote p drive
coclico_data_dir = p_drive.joinpath("11205479-coclico", "FASTTRACK_DATA")
coclico_data_dir = p_drive.joinpath(
"11205479-coclico", "FASTTRACK_DATA"
) # remote p drive

# use local or remote data dir
use_local_data = False
@@ -217,9 +170,13 @@ def itemize(
if not ds_dir.exists():
raise FileNotFoundError(f"Data dir does not exist, {str(ds_dir)}")

# directory to export result (make if not exists)
cog_dir = ds_dir.joinpath("cogs")
cog_dir.mkdir(parents=True, exist_ok=True)
# directory to export result
cog_dirs = ds_dir.joinpath("cogs")

# load metadata template
metadata_fp = ds_dir.joinpath(METADATA)
with open(metadata_fp, "r") as f:
metadata = json.load(f)
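# Sketch of the fields this script reads from metadata_AR5_slp.json; the key names are
# taken from the lookups elsewhere in this file, the values shown are placeholders only:
#   {
#     "TITLE": "...",
#     "TITLE_ABBREVIATION": "...",
#     "SHORT_DESCRIPTION": "...",
#     "KEYWORDS": ["..."],
#     "UNITS": "...",
#     "LICENSE": "...",
#     "SPATIAL_EXTENT": "...",
#     "TEMPORAL_EXTENT": "...",
#     "PROVIDERS": "..."
#   }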

catalog = Catalog.from_file(os.path.join(rel_root, STAC_DIR, "catalog.json"))

@@ -231,39 +188,49 @@
collection = get_template_collection(
template_fp=template_fp,
collection_id=COLLECTION_ID,
title=COLLECTION_TITLE,
description=DATASET_DESCRIPTION,
title=metadata["TITLE"],
description=metadata["SHORT_DESCRIPTION"],
keywords=metadata["KEYWORDS"],
# license=metadata["LICENSE"],
# spatial_extent=metadata["SPATIAL_EXTENT"],
# temporal_extent=metadata["TEMPORAL_EXTENT"],
# providers=metadata["PROVIDERS"],
)

layout = LayoutCoG()

# the dataset contains three different rcp scenarios
RCPS = ["26", "45", "85"]
for rcp in RCPS:
# load dataset and do some pre-processing
ds_fp = ds_dir.joinpath(f"total-ens-slr-{rcp}-5.nc")
ds = xr.open_dataset(ds_fp)
# open the dataset
ds_fp = ds_dir.joinpath(CF_FILE)
ds = xr.open_dataset(ds_fp)

# loop over gdal / nc folder struct (max depth = 3)
# NOTE, does not work as we open the geotiff which contains too little information to itemize objects (need dataArrays)
# for scen_str in os.listdir(cog_dirs): # search in the scenarios (the highest level)

# for var_ens in os.listdir(
# cog_dirs.joinpath(scen_str)
# ): # search in the variables / ensembles (the middle level)

# format dataset based upon notebooks/18_SLR_AR5.ipynb
ds = make_cf_compliant(ds)
# TODO: write CF compliancy output file?
# for fname in os.listdir(
# cog_dirs.joinpath(scen_str, var_ens)
# ): # search in the file name (the lowest level)

# # convert cf noleap yrs to datetime
# ds["time"] = ds.indexes["time"].to_datetimeindex()
# blob_name = pathlib.Path(scen_str, var_ens, fname)
# outpath = cog_dirs.joinpath(blob_name)
# da = rio.open_rasterio(outpath, masked=True)

# add crs and spatial dims
ds.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
if not ds.rio.crs:
ds = ds.rio.write_crs("EPSG:4326")
# TODO: check what we can generalize and put into a function for temporal CoGs
# loop over CF compliant NC file, same code as in related .ipynb
for idx, scen in enumerate(ds["scenarios"].values):
rcp = scen.decode("utf-8")

# format rcp name for filenaming
rcp_name = f"rcp={rcp}"
rcp_name = "rcp=%s" % rcp.strip("RCP")
print(rcp_name)

# extract list of data variables
variables = set(ds.variables) - set(ds.dims) - set(ds.coords)

ds["ensemble"] = ds.coords["ensemble"].astype(str)

ntimes = ds.dims["time"]
for ntime in range(ntimes):
ds2 = ds.copy()
@@ -277,35 +244,57 @@

for var_name in variables:
# time bounds are extracted, so nv dim can be dropped, as tiff should be 2D or 3D.
da = ds2[var_name]

# compose tif name
fname = time_name + ".tif"
blob_name = pathlib.Path(rcp_name, var_name, fname)
outpath = cog_dir.joinpath(blob_name)

# make parent dir if not exists
outpath.parent.mkdir(parents=True, exist_ok=True)

# add time name as scalar variable to tif
da["time_bnds"] = time_name

template_item = pystac.Item(
"id", None, None, datetime.datetime(2000, 1, 1), {}
)

item = itemize(da, template_item, blob_name=str(blob_name))
collection.add_item(item, strategy=layout)

# set overwrite is false because tifs should be unique
try:
write_cog(da, fname=outpath, overwrite=False)
except OSError as e:
continue
da = ds2.sel({"nscenarios": idx})[var_name]

for idv, ens in enumerate(da["ensemble"].values):
da2 = da.isel({"ensemble": idv})

# add crs and spatial dims
da2.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
if not da2.rio.crs:
da2 = da2.rio.write_crs("EPSG:4326")

# reset some variables and attributes
da2["time"] = np.array(
cftime_to_pdts(da2["time"].item()).strftime(
"%Y-%m-%d %H:%M:%S"
),
dtype="S",
)
da2["time"].attrs["standard_name"] = "time"
da2["time_bnds"] = np.array(time_name, dtype="S")
da2["time_bnds"].attrs["long_name"] = "time boundaries"

# compose tif name
fname = time_name + ".tif"
blob_name = pathlib.Path(
rcp_name, var_name + "_ens%s" % int(ens), fname
)
outpath = cog_dirs.joinpath(blob_name)
template_item = pystac.Item(
"id", None, None, datetime.datetime(2000, 1, 1), {}
)

item = itemize(da2, template_item, blob_name=str(blob_name))
collection.add_item(item, strategy=layout)

# TODO: use gen_default_summaries() from blueprint.py after making it frontend compliant.
collection.summaries = Summaries({})

# this calls CollectionCoclicoExtension since stac_obj==pystac.Collection
coclico_ext = CoclicoExtension.ext(collection, add_if_missing=True)

# Add frontend properties defined above to collection extension properties. The
# properties attribute of this extension is linked to the extra_fields attribute of
# the stac collection.
coclico_ext.units = metadata["UNITS"]
coclico_ext.plot_series = PLOT_SERIES
coclico_ext.plot_x_axis = PLOT_X_AXIS
coclico_ext.plot_type = PLOT_TYPE
coclico_ext.min_ = MIN
coclico_ext.max_ = MAX
coclico_ext.linear_gradient = LINEAR_GRADIENT

# add collection to catalog
catalog.add_child(collection)

@@ -329,9 +318,9 @@ def itemize(
)

dir_to_google_cloud(
dir_path=str(cog_dir),
dir_path=str(cog_dirs),
gcs_project=GCS_PROJECT,
bucket_name=BUCKET_NAME,
bucket_proj=BUCKET_PROJ,
dir_name=DATASET_FILENAME,
dir_name=metadata["TITLE_ABBREVIATION"],
)
