ocean-transport · jbusecke · Jun 24, 2024 · Jun 24, 2024
diff --git a/.github/workflows/build_image.yaml b/.github/workflows/build_image.yaml
@@ -6,7 +6,7 @@ on:
       - main
     paths: # only run on changes to the Dockerfile
       - 'Dockerfile'
-    
+
 
 jobs:
   build-and-push:

diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
-FROM quay.io/pangeo/pangeo-notebook:2023.10.24 
+FROM quay.io/pangeo/pangeo-notebook:2023.10.24
 LABEL maintainer="Julius Busecked"
 LABEL repo="https://github.com/ocean-transport/scale-aware-air-sea"
 
 
-RUN mamba install -n=notebook aerobulk-python -y 
+RUN mamba install -n=notebook aerobulk-python -y
 RUN pip install coiled
diff --git a/README.md b/README.md
@@ -17,4 +17,4 @@ pip install .
 Follow the above instructions but install the package via
 ```
 pip install -e ".[dev]"
-```
+```
diff --git a/ci/environment.yaml b/ci/environment.yaml
@@ -10,6 +10,7 @@ dependencies:
   - fsspec
   - gcsfs
   - intake-esm
+  - intake-xarray
   - numpy
   - pip
   - pytest

diff --git a/notebooks/jbusecke/new-workspace.jupyterlab-workspace b/notebooks/jbusecke/new-workspace.jupyterlab-workspace
@@ -1 +1 @@
-{"data":{"layout-restorer:data":{"main":{"dock":{"type":"split-area","orientation":"horizontal","sizes":[0.5,0.5],"children":[{"type":"tab-area","currentIndex":2,"widgets":["notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/aerobulk-python_performance.ipynb","terminal:1","notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/cm26_pipeline.ipynb","notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/reproduce_cm26_flux.ipynb","notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/testing/cm26_xesmf_hack.ipynb"]},{"type":"tab-area","currentIndex":3,"widgets":["dask-dashboard-launcher:/individual-progress","dask-dashboard-launcher:/individual-workers","dask-dashboard-launcher:/individual-task-stream","dask-dashboard-launcher:/individual-workers-memory"]}]},"current":"dask-dashboard-launcher:/individual-workers-memory"},"down":{"size":0,"widgets":[]},"left":{"collapsed":true,"widgets":["filebrowser","running-sessions","dask-dashboard-launcher","git-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0,1,0]},"file-browser-filebrowser:cwd":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke"},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/aerobulk-python_performance.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/aerobulk-python_performance.ipynb","factory":"Notebook"}},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/cm26_pipeline.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/cm26_pipeline.ipynb","factory":"Notebook"}},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/reproduce_cm26_flux.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/reproduce_cm26_flux.ipynb","factory":"Notebook"}},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/testing/cm26_xesmf_hack.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/testing/cm26_xesmf_hack.ipynb","factory":"Notebook"}},"terminal:1":{"data":{"name":"1"}},"dask-dashboard-launcher":{"url":"https://us-central1-b.gcp.pangeo.io/services/dask-gateway/clusters/prod.1e69eadfa71b4df0842dbaaeb5c7b01d/","cluster":""},"dask-dashboard-launcher:/individual-progress":{"data":{"route":"/individual-progress","label":"Progress","key":"Progress"}},"dask-dashboard-launcher:/individual-workers":{"data":{"route":"/individual-workers","label":"Workers","key":"Workers"}},"dask-dashboard-launcher:/individual-task-stream":{"data":{"route":"/individual-task-stream","label":"Task Stream","key":"Task Stream"}},"dask-dashboard-launcher:/individual-workers-memory":{"data":{"route":"/individual-workers-memory","label":"Workers Memory","key":"Workers Memory"}}},"metadata":{"id":"new-workspace","last_modified":"2022-06-23T17:40:03.102459+00:00","created":"2022-06-23T17:40:03.102459+00:00"}}
+{"data":{"layout-restorer:data":{"main":{"dock":{"type":"split-area","orientation":"horizontal","sizes":[0.5,0.5],"children":[{"type":"tab-area","currentIndex":2,"widgets":["notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/aerobulk-python_performance.ipynb","terminal:1","notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/cm26_pipeline.ipynb","notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/reproduce_cm26_flux.ipynb","notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/testing/cm26_xesmf_hack.ipynb"]},{"type":"tab-area","currentIndex":3,"widgets":["dask-dashboard-launcher:/individual-progress","dask-dashboard-launcher:/individual-workers","dask-dashboard-launcher:/individual-task-stream","dask-dashboard-launcher:/individual-workers-memory"]}]},"current":"dask-dashboard-launcher:/individual-workers-memory"},"down":{"size":0,"widgets":[]},"left":{"collapsed":true,"widgets":["filebrowser","running-sessions","dask-dashboard-launcher","git-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0,1,0]},"file-browser-filebrowser:cwd":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke"},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/aerobulk-python_performance.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/aerobulk-python_performance.ipynb","factory":"Notebook"}},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/cm26_pipeline.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/cm26_pipeline.ipynb","factory":"Notebook"}},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/reproduce_cm26_flux.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/workflow/reproduce_cm26_flux.ipynb","factory":"Notebook"}},"notebook:1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/testing/cm26_xesmf_hack.ipynb":{"data":{"path":"1_PROJECTS/scale-aware-air-sea/notebooks/jbusecke/testing/cm26_xesmf_hack.ipynb","factory":"Notebook"}},"terminal:1":{"data":{"name":"1"}},"dask-dashboard-launcher":{"url":"https://us-central1-b.gcp.pangeo.io/services/dask-gateway/clusters/prod.1e69eadfa71b4df0842dbaaeb5c7b01d/","cluster":""},"dask-dashboard-launcher:/individual-progress":{"data":{"route":"/individual-progress","label":"Progress","key":"Progress"}},"dask-dashboard-launcher:/individual-workers":{"data":{"route":"/individual-workers","label":"Workers","key":"Workers"}},"dask-dashboard-launcher:/individual-task-stream":{"data":{"route":"/individual-task-stream","label":"Task Stream","key":"Task Stream"}},"dask-dashboard-launcher:/individual-workers-memory":{"data":{"route":"/individual-workers-memory","label":"Workers Memory","key":"Workers Memory"}}},"metadata":{"id":"new-workspace","last_modified":"2022-06-23T17:40:03.102459+00:00","created":"2022-06-23T17:40:03.102459+00:00"}}
diff --git a/notebooks/jbusecke/testing/xesmf_test.py b/notebooks/jbusecke/testing/xesmf_test.py
@@ -12,4 +12,3 @@
 ds_atmos = xr.open_zarr('gs://cmip6/GFDL_CM2_6/control/atmos_daily.zarr', **kwargs)
 
 regridder = xe.Regridder(ds_atmos.olr.isel(time=0), ds.surface_temp.isel(time=0), 'bilinear', periodic=True) # all the atmos data is on the cell center AFAIK
-
diff --git a/notebooks/jbusecke/workflow/cm26_pipeline-debug.py b/notebooks/jbusecke/workflow/cm26_pipeline-debug.py
@@ -2,7 +2,7 @@
 # coding: utf-8
 
 # # Debug issues with exceeding wind stress
-# 
+#
 # ## Tasks
 # - Find timestep(s) that exhibit the behavior
 # - Confirm that this behavior only affects certain algorithms
@@ -31,7 +31,7 @@
 from dask.diagnostics import ProgressBar
 from cm26_utils import write_split_zarr, noskin_ds_wrapper, load_and_merge_cm26
 
-# 👇 replace with your key 
+# 👇 replace with your key
 with open('/home/jovyan/keys/pangeo-forge-ocean-transport-4967-347e2048c5a1.json') as token_file:
     token = json.load(token_file)
 
@@ -63,7 +63,7 @@
 # I did `jupyter nbconvert --to python cm26_pipeline-debug.ipynb`
 # and then I get the error about the wind stress.
 
-# I have executed this with all algos and I get crashes for: 
+# I have executed this with all algos and I get crashes for:
 # 'coare3p6'
 # 'andreas'
 # 'coare3p0'
@@ -87,7 +87,7 @@
 
 
 # ## Investigate the max windstress values we are getting with the working algos
-# 
+#
 # Is there a correlation betweewn max wind speeds and stresses? Yeah definitely!
 
 # In[ ]:
@@ -118,8 +118,8 @@
 
 # ## Ok can we actually get around this and get some results at all?
 # If not we need to raise the tau cut of in aerobulk.
-# 
-# My simple approach right here is to set every wind value larger than `threshold` to zero. This is not a feasible solution for our processing, but I just want to see how low we have to go to get all algos to go through! 
+#
+# My simple approach right here is to set every wind value larger than `threshold` to zero. This is not a feasible solution for our processing, but I just want to see how low we have to go to get all algos to go through!
 
 # In[ ]:
 
@@ -133,7 +133,7 @@
     mask = ds_masked.wind>threshold
     ds_masked['u_ref'] = ds_masked['u_ref'].where(mask, 0)
     ds_masked['v_ref'] = ds_masked['v_ref'].where(mask, 0)
-    
+
     break
     ds_out = noskin_ds_wrapper(ds_merged, algo=algo, input_range_check=False)
     with ProgressBar():
@@ -143,4 +143,3 @@
     stress_max = stress.max(['xt_ocean', 'yt_ocean']).assign_coords(algo=algo)
     print(stress_max)
     datasets.append(stress_max)
-
diff --git a/notebooks/jbusecke/workflow/cm26_pipeline.py b/notebooks/jbusecke/workflow/cm26_pipeline.py
@@ -18,7 +18,7 @@
 from dask.diagnostics import ProgressBar
 from cm26_utils import write_split_zarr, noskin_ds_wrapper
 
-# 👇 replace with your key 
+# 👇 replace with your key
 with open('/home/jovyan/keys/pangeo-forge-ocean-transport-4967-347e2048c5a1.json') as token_file:
     token = json.load(token_file)
 fs = gcsfs.GCSFileSystem(token=token)
@@ -87,7 +87,7 @@
 
 ############################# wind cutting ##############
 # ok so lets not add nans into fields like above. Instead, lets see in which timesteps this actually occurs and for noe completely ignore these timesteps
-# This is not ideal in the long run, but maybe at least gives us a way to output 
+# This is not ideal in the long run, but maybe at least gives us a way to output
 
 # ds_cut = ds_merged.isel(time=slice(0,500))
 # wind = ds_cut.wind
@@ -96,7 +96,7 @@
 threshold = 30
 with ProgressBar():
     strong_wind_cells = (wind > threshold).sum(['xt_ocean','yt_ocean']).load()
-    
+
 strong_wind_index = strong_wind_cells > 0
 
 # double check that these events are still rare in space and time
@@ -121,6 +121,6 @@
 if fs.exists(path) and overwrite:
 # # # delete the mapper (only uncomment if you want to start from scratch!)
     print("DELETE existing store")
-    fs.rm(path, recursive=True)    
+    fs.rm(path, recursive=True)
 
-write_split_zarr(mapper, ds_out, split_interval=64)
+write_split_zarr(mapper, ds_out, split_interval=64)
diff --git a/notebooks/jbusecke/workflow/cm26_utils.py b/notebooks/jbusecke/workflow/cm26_utils.py
@@ -14,37 +14,37 @@
 #     - Adjust units for aerobulk input
 #     - Calculate relative wind components
 #     """
-#     kwargs = dict(consolidated=True, use_cftime=True, inline_array=inline_array, engine='zarr')#, 
+#     kwargs = dict(consolidated=True, use_cftime=True, inline_array=inline_array, engine='zarr')#,
 #     print('Load Data')
 #     mapper = filesystem.get_mapper("gs://cmip6/GFDL_CM2_6/control/surface")
 #     # ds_ocean = xr.open_dataset(mapper, chunks='auto', **kwargs)
 #     # ds_ocean = xr.open_dataset(mapper, chunks={'time':2}, **kwargs)
 #     ds_ocean = xr.open_dataset(mapper, chunks={'time':3}, **kwargs)
 #     # cat = open_catalog("https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/ocean/GFDL_CM2.6.yaml")
 #     # ds_ocean  = cat["GFDL_CM2_6_control_ocean_surface"].to_dask()
-    
+
 #     # ds_flux  = cat["GFDL_CM2_6_control_ocean_boundary_flux"].to_dask()
 #     mapper = filesystem.get_mapper("gs://cmip6/GFDL_CM2_6/control/ocean_boundary")
 #     # ds_flux = xr.open_dataset(mapper, chunks='auto', **kwargs)
 #     # ds_flux = xr.open_dataset(mapper, chunks={'time':2}, **kwargs)
 #     ds_flux = xr.open_dataset(mapper, chunks={'time':3}, **kwargs)
-    
-        
+
+
 #     # xarray says not to do this
 #     # ds_atmos = xr.open_zarr('gs://cmip6/GFDL_CM2_6/control/atmos_daily.zarr', chunks={'time':1}, **kwargs)
 #     mapper = filesystem.get_mapper("gs://cmip6/GFDL_CM2_6/control/atmos_daily.zarr")
 #     # ds_atmos = xr.open_dataset(mapper, chunks={'time':120}, **kwargs)
 #     # ds_atmos = xr.open_dataset(mapper, chunks={'time':2}, **kwargs)
 #     # ds_atmos = xr.open_dataset(mapper, chunks={'time':3}, **kwargs)
 #     ds_atmos = xr.open_dataset(mapper, chunks={'time':120}, **kwargs).chunk({'time':3})
-    
+
 #     # # instead do this
 #     # ds_atmos = ds_atmos.chunk({'time':1})
-    
+
 #     mapper = filesystem.get_mapper("gs://cmip6/GFDL_CM2_6/grid")
 #     ds_oc_grid = xr.open_dataset(mapper, chunks={}, **kwargs)
 #     # ds_oc_grid  = cat["GFDL_CM2_6_grid"].to_dask()
-    
+
 #     print('Align in time')
 #     # cut to same time
 #     all_dims = set(list(ds_ocean.dims)+list(ds_atmos.dims))
@@ -78,7 +78,7 @@
 #     mapper = filesystem.get_mapper(path)
 #     ds_regridder = xr.open_zarr(mapper).load()
 #     regridder = xe.Regridder(
-#         ds_atmos.olr.to_dataset(name='dummy').isel(time=0).reset_coords(drop=True),# this is the same dumb problem I keep having with 
+#         ds_atmos.olr.to_dataset(name='dummy').isel(time=0).reset_coords(drop=True),# this is the same dumb problem I keep having with
 #         ds_ocean.surface_temp.to_dataset(name='dummy').isel(time=0).reset_coords(drop=True),
 #         'bilinear',
 #         weights=ds_regridder,
@@ -100,17 +100,17 @@
 #     # fix units for aerobulk
 #     ds_merged['surface_temp'] = ds_merged['surface_temp'] + 273.15
 #     ds_merged['slp'] = ds_merged['slp'] * 100 # check this
-    
+
 #     print('Mask nans')
 #     # atmos missing values are filled with 0s, which causes issues with the filtering
 #     # Ideally this should be masked before the regridding, but xesmf fills with 0 again...
 #     mask = ~np.isnan(ds_merged['surface_temp'])
 #     for mask_var in ['slp', 't_ref', 'q_ref']:
 #         ds_merged[mask_var] = ds_merged[mask_var].where(mask)
-    
+
 #     # Calculate relative wind
 #     print('Calculate relative wind')
 #     ds_merged['u_relative'] = ds_merged['u_ref'] - ds_merged['u_ocean']
 #     ds_merged['v_relative'] = ds_merged['v_ref'] - ds_merged['v_ocean']
-    
-#     return ds_merged
+
+#     return ds_merged
diff --git a/notebooks/jbusecke/workflow/cm26_utils_old.py b/notebooks/jbusecke/workflow/cm26_utils_old.py
@@ -11,15 +11,15 @@ def write_split_zarr(store, ds, split_dim='time', chunks=1, split_interval=180):
     This can be helpful to e.g. avoid problems with overly eager dask schedulers
     """
     # Got my inspiration for this mostly here: https://github.com/pydata/xarray/issues/6069
-    
+
     # determine the variables and coordinates that depend on the split_dim
     other_dims = [di for di in ds.dims if di != split_dim]
     split_vars_coords = [va for va in ds.variables if split_dim in ds[va].dims and va not in ds.dims]
     non_split_vars_coords = [va for va in ds.variables if va not in split_vars_coords and va not in ds.dims]
-    
+
     # Generate a stripped dataset that only contains variables/coordinates that do not depend on `split_dim`
     ds_stripped = ds.drop_vars(split_vars_coords+[split_dim])
-        
+
     # initialize the store without writing values
     print('initializing store')
     ds.to_zarr(
@@ -28,45 +28,45 @@ def write_split_zarr(store, ds, split_dim='time', chunks=1, split_interval=180):
         encoding={split_dim:{"chunks":[chunks]}},
         consolidated=True, # TODO: Not sure if this is proper. Might have to consolidate the whole thing as a last step?
     )
-    
+
     # Write out only the non-split variables/coordinates
     if len(non_split_vars_coords) > 0:
         print('Writing coordinates')
         ds_stripped.to_zarr(store, mode='a') #I guess a is 'add'. This is honestly not clear enough in the xarray docs.
-        # with `w` there are issues with the shape. 
-    
+        # with `w` there are issues with the shape.
+
     # TODO: what about the attrs?
-    
+
     # Populate split chunks as regions
     n = len(ds[split_dim])
     splits = list(range(0,n,split_interval))
 
     # Make sure the last item in the list covers the full length of the time on our dataset
     if splits[-1] != n:
         splits = splits + [n]
-    
+
     for ii in tqdm(range(len(splits)-1)):
         print(f'Writing split {ii}')
         # TODO: put some retry logic in here...
         start = splits[ii]
         stop = splits[ii+1]
-        
+
         ds_write = ds.isel({split_dim:slice(start, stop)})
         print(f'Start: {ds_write[split_dim][0].data}')
         print(f'Stop: {ds_write[split_dim][-1].data}')
-        
+
         # strip everything except the values
         drop_vars = non_split_vars_coords+other_dims
         ds_write = ds_write.drop_vars(drop_vars)
-        
+
         with ProgressBar():
             ds_write.to_zarr(store, region={split_dim:slice(start, stop)}, mode='a')#why are the variables not instantiated in the init step
-            
-# TODO: This is model agnostic and should live somewhere else?         
+
+# TODO: This is model agnostic and should live somewhere else?
 def noskin_ds_wrapper(ds_in, algo='ecmwf', **kwargs):
     ds_out = xr.Dataset()
     ds_in = ds_in.copy(deep=False)
-    
+
     sst = ds_in.surface_temp + 273.15
     t_zt = ds_in.t_ref
     hum_zt = ds_in.q_ref
@@ -75,7 +75,7 @@ def noskin_ds_wrapper(ds_in, algo='ecmwf', **kwargs):
     slp = ds_in.slp * 100 # check this
     zu = 10
     zt = 2
-    
+
     ql, qh, taux, tauy, evap =  noskin(
         sst,
         t_zt,
@@ -91,7 +91,7 @@ def noskin_ds_wrapper(ds_in, algo='ecmwf', **kwargs):
     ds_out['ql'] = ql
     ds_out['qh'] = qh
     ds_out['evap'] = evap
-    ds_out['taux'] = taux 
+    ds_out['taux'] = taux
     ds_out['tauy'] = tauy
     return ds_out
 
@@ -114,7 +114,7 @@ def load_and_merge_cm26(regridder_token):
     )
     # instead do this
     ds_atmos = ds_atmos.chunk({'time':1})
-    
+
     fs = gcsfs.GCSFileSystem(token=regridder_token)
     path = 'ocean-transport-group/scale-aware-air-sea/regridding_weights/CM26_atmos2ocean.zarr'
     mapper = fs.get_mapper(path)
@@ -138,4 +138,4 @@ def load_and_merge_cm26(regridder_token):
     ds_merged = ds_merged.transpose(
         'xt_ocean', 'yt_ocean', 'time'
     )
-    return ds_merged
+    return ds_merged
diff --git a/notebooks/jbusecke/workflow/debugging_script.py b/notebooks/jbusecke/workflow/debugging_script.py
@@ -4,7 +4,7 @@
 def noskin_ds_wrapper(ds_in):
     ds_out = xr.Dataset()
     ds_in = ds_in.copy(deep=False)
-    
+
     sst = ds_in.surface_temp + 273.15
     t_zt = ds_in.t_ref
     hum_zt = ds_in.q_ref
@@ -13,7 +13,7 @@ def noskin_ds_wrapper(ds_in):
     slp = ds_in.slp * 100 # check this
     zu = 10
     zt = 2
-    
+
     ql, qh, taux, tauy, evap =  noskin(
         sst,
         t_zt,
@@ -28,10 +28,10 @@ def noskin_ds_wrapper(ds_in):
     ds_out['ql'] = ql
     ds_out['qh'] = qh
     ds_out['evap'] = evap
-    ds_out['taux'] = taux 
+    ds_out['taux'] = taux
     ds_out['tauy'] = tauy
     return ds_out
 
 # load the tempsave file from `cm26_pipeline.ipynb`
 ds_coarsened = xr.open_dataset('test_coarsened_filled.nc')
-ds_coarse_res = noskin_ds_wrapper(ds_coarsened)
+ds_coarse_res = noskin_ds_wrapper(ds_coarsened)