review lake
malmans2 committed Jul 20, 2023
1 parent eeb8883 commit a7ec6f3
Showing 1 changed file with 63 additions and 99 deletions.
notebooks/wp5/lake_water_temperature_outlier_detection.ipynb — 162 changes: 63 additions & 99 deletions
@@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "48a773cc",
"metadata": {},
"source": [
"# Statistical analysis"
]
},
{
"cell_type": "markdown",
"id": "e2cc23fd",
@@ -16,25 +8,6 @@
"# Completeness of data series and outliers detection"
]
},
{
"cell_type": "markdown",
"id": "098c163c",
"metadata": {},
"source": [
"Use Case: Check completeness of lake water temperature time series for Great African Lakes and outliers detection.\n",
"\n",
"User Question: The satellite lakes water temperature dataset for Great African Lakes is complete in time? Are there some outliers?\n",
"\n",
"Methods:\n",
"\n",
"```\n",
" - Select Great African Lakes area and extract the mean water lakes temperature\n",
" - Plot the time series\n",
" - Calculate percentage of missing values\n",
" - Boxplot of the values and outliers detection\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "fa78226f",
@@ -51,7 +24,10 @@
"outputs": [],
"source": [
"import cartopy.crs as ccrs\n",
"import matplotlib.cbook\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import xarray as xr\n",
"from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils\n",
"\n",
"plt.style.use(\"seaborn-v0_8-notebook\")"
@@ -122,11 +98,23 @@
"metadata": {},
"outputs": [],
"source": [
"def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname):\n",
" ds = ds[[varname]]\n",
"def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname, lakeids):\n",
" ds = ds[[varname, \"lakeid\"]]\n",
" ds = ds.chunk({\"time\": 1, \"latitude\": 1_200, \"longitude\": 2_400})\n",
" ds = utils.regionalise(ds, lon_slice=lon_slice, lat_slice=lat_slice)\n",
" ds = diagnostics.spatial_weighted_mean(ds)\n",
" return ds"
" dataarrays = []\n",
" for lakeid in lakeids:\n",
" da = ds[varname].where(ds[\"lakeid\"] == lakeid)\n",
" da = diagnostics.spatial_weighted_mean(da)\n",
" dataarrays.append(da.expand_dims(lakeid=[lakeid]))\n",
" return xr.concat(dataarrays, \"lakeid\").to_dataset()\n",
"\n",
"\n",
"def get_lakeid(ds, lon_slice, lat_slice):\n",
" da = ds[\"lakeid\"].isel(time=0)\n",
" da = da.chunk({\"latitude\": 1_200, \"longitude\": 2_400})\n",
" da = utils.regionalise(da, lon_slice=lon_slice, lat_slice=lat_slice)\n",
" return da.to_dataset()"
]
},
{
@@ -157,17 +145,24 @@
" \"lon_slice\": lon_slice,\n",
" \"lat_slice\": lat_slice,\n",
" \"varname\": varname,\n",
" \"lakeids\": [3, 7, 10],\n",
" },\n",
")\n",
"da = ds[varname]"
"da = ds[varname].compute()"
]
},
{
"cell_type": "markdown",
"id": "328f352f",
"metadata": {},
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Extract lake id to plot a map of the region"
"## Extract lake IDs to plot a map of the region"
]
},
{
@@ -180,23 +175,24 @@
"# We use one of the request previously cached\n",
"single_request = requests[0]\n",
"single_request[\"month\"] = single_request[\"month\"][0]\n",
"ds_raw = download.download_and_transform(\n",
"da_lakeid = download.download_and_transform(\n",
" collection_id,\n",
" single_request,\n",
" chunks=chunks,\n",
")\n",
"\n",
"da_lakeid = utils.regionalise(\n",
" ds_raw[\"lakeid\"].isel(time=0), lon_slice=lon_slice, lat_slice=lat_slice\n",
")"
" transform_func=get_lakeid,\n",
" transform_func_kwargs={\n",
" \"lon_slice\": lon_slice,\n",
" \"lat_slice\": lat_slice,\n",
" },\n",
")[\"lakeid\"]"
]
},
{
"cell_type": "markdown",
"id": "674c3e27",
"metadata": {},
"source": [
"## Plot projected map"
"## Plot projected map of lake IDs"
]
},
{
@@ -206,7 +202,7 @@
"metadata": {},
"outputs": [],
"source": [
"_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree())"
"_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree(), show_stats=False)"
]
},
{
@@ -224,32 +220,23 @@
"metadata": {},
"outputs": [],
"source": [
"da.plot()\n",
"_ = plt.title(\"Spatial weighted mean\")"
]
},
{
"cell_type": "markdown",
"id": "e753ff96",
"metadata": {},
"source": [
"## Percentage of missing values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fec2d02c",
"metadata": {},
"outputs": [],
"source": [
"num_missing = float(da.isnull().sum() / da.size * 100)\n",
"print(f\"Number of missing values: {num_missing:.2f} %.\")"
"for lakeid, da_lakeid in da.groupby(\"lakeid\"):\n",
" da_lakeid.dropna(\"time\").plot(label=lakeid)\n",
"plt.legend()\n",
"plt.grid()\n",
"plt.title(\"Spatial weighted mean\")\n",
"plt.show()\n",
"\n",
"# Print missing values\n",
"missings = da.isnull().sum(\"time\") / da.sizes[\"time\"] * 100\n",
"id_digits = max(map(len, da[\"lakeid\"].astype(str).values))\n",
"for lakeid, missing in missings.groupby(\"lakeid\"):\n",
" print(f\"Missing values of lake ID {lakeid:<{id_digits}}: {missing.values:.2f} %\")"
]
},
{
"cell_type": "markdown",
"id": "055c8c64",
"id": "c46bdda8-fdd2-44fb-afc2-f3a4af6e98b6",
"metadata": {},
"source": [
"## Boxplot"
@@ -258,44 +245,21 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b1656320",
"id": "4050019d-151d-4933-91af-38265578ae7b",
"metadata": {},
"outputs": [],
"source": [
"# Create a boxplot\n",
"valid_da = da.where(da.notnull().compute(), drop=True).chunk(-1)\n",
"plt.boxplot(valid_da)\n",
"\n",
"# Add title and labels\n",
"# plt.title(\"Boxplot of array with missing values\")\n",
"plt.xlabel(\"Array\")\n",
"plt.ylabel(\"lake surface skin temperature\")\n",
"df = da.to_dataframe()\n",
"df.boxplot(by=\"lakeid\")\n",
"plt.show()\n",
"\n",
"# Find 1st and 3rd quantile and median\n",
"da_qiles = valid_da.quantile([0.25, 0.5, 0.75])\n",
"\n",
"# Finding the IQR region\n",
"iqr = da_qiles.sel(quantile=0.75) - da_qiles.sel(quantile=0.25)\n",
"\n",
"# Finding upper and lower whiskers\n",
"stats = {\n",
" \"median\": float(da_qiles.sel(quantile=0.5)),\n",
" \"IQR upper bound\": float(da_qiles.sel(quantile=0.75) + (1.5 * iqr)),\n",
" \"IQR lower bound\": float(da_qiles.sel(quantile=0.25) - (1.5 * iqr)),\n",
" \"minimum\": float(da.min()),\n",
" \"maximum\": float(da.max()),\n",
"}\n",
"\n",
"# Print stats\n",
"for key, value in stats.items():\n",
" print(f\"The {key} value is {value:.2f} {valid_da.units}\")\n",
"\n",
"# Check outliers\n",
"no_outliers = (\n",
" stats[\"minimum\"] >= stats[\"IQR lower bound\"]\n",
" and stats[\"maximum\"] <= stats[\"IQR upper bound\"]\n",
")\n",
"print(f\"\\nThere are {'NO' if no_outliers else 'SOME'} outliers in the series.\")"
"# Print statistics\n",
"boxplot_stats = {}\n",
"for lakeid, df_lakeid in df.groupby(\"lakeid\"):\n",
" values = df_lakeid.dropna().values.squeeze()\n",
" (boxplot_stats[lakeid],) = matplotlib.cbook.boxplot_stats(values)\n",
"boxplot_stats = pd.DataFrame(boxplot_stats)\n",
"boxplot_stats"
]
}
],
@@ -315,7 +279,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.10.12"
}
},
"nbformat": 4,
