diff --git a/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb b/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb index 971e812..3b44495 100644 --- a/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb +++ b/notebooks/wp5/lake_water_temperature_outlier_detection.ipynb @@ -1,13 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "48a773cc", - "metadata": {}, - "source": [ - "# Statistical analysis" - ] - }, { "cell_type": "markdown", "id": "e2cc23fd", @@ -16,25 +8,6 @@ "# Completeness of data series and outliers detection" ] }, - { - "cell_type": "markdown", - "id": "098c163c", - "metadata": {}, - "source": [ - "Use Case: Check completeness of lake water temperature time series for Great African Lakes and outliers detection.\n", - "\n", - "User Question: The satellite lakes water temperature dataset for Great African Lakes is complete in time? Are there some outliers?\n", - "\n", - "Methods:\n", - "\n", - "```\n", - " - Select Great African Lakes area and extract the mean water lakes temperature\n", - " - Plot the time series\n", - " - Calculate percentage of missing values\n", - " - Boxplot of the values and outliers detection\n", - "```" - ] - }, { "cell_type": "markdown", "id": "fa78226f", @@ -51,7 +24,10 @@ "outputs": [], "source": [ "import cartopy.crs as ccrs\n", + "import matplotlib.cbook\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import xarray as xr\n", "from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils\n", "\n", "plt.style.use(\"seaborn-v0_8-notebook\")" @@ -122,11 +98,23 @@ "metadata": {}, "outputs": [], "source": [ - "def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname):\n", - " ds = ds[[varname]]\n", + "def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname, lakeids):\n", + " ds = ds[[varname, \"lakeid\"]]\n", + " ds = ds.chunk({\"time\": 1, \"latitude\": 1_200, \"longitude\": 2_400})\n", " ds = utils.regionalise(ds, lon_slice=lon_slice, lat_slice=lat_slice)\n", - " ds = diagnostics.spatial_weighted_mean(ds)\n", - " return ds" + " dataarrays = []\n", + " for lakeid in lakeids:\n", + " da = ds[varname].where(ds[\"lakeid\"] == lakeid)\n", + " da = diagnostics.spatial_weighted_mean(da)\n", + " dataarrays.append(da.expand_dims(lakeid=[lakeid]))\n", + " return xr.concat(dataarrays, \"lakeid\").to_dataset()\n", + "\n", + "\n", + "def get_lakeid(ds, lon_slice, lat_slice):\n", + " da = ds[\"lakeid\"].isel(time=0)\n", + " da = da.chunk({\"latitude\": 1_200, \"longitude\": 2_400})\n", + " da = utils.regionalise(da, lon_slice=lon_slice, lat_slice=lat_slice)\n", + " return da.to_dataset()" ] }, { @@ -157,17 +145,24 @@ " \"lon_slice\": lon_slice,\n", " \"lat_slice\": lat_slice,\n", " \"varname\": varname,\n", + " \"lakeids\": [3, 7, 10],\n", " },\n", ")\n", - "da = ds[varname]" + "da = ds[varname].compute()" ] }, { "cell_type": "markdown", "id": "328f352f", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "source": [ - "## Extract lake id to plot a map of the region" + "## Extract lake IDs to plot a map of the region" ] }, { @@ -180,15 +175,16 @@ "# We use one of the request previously cached\n", "single_request = requests[0]\n", "single_request[\"month\"] = single_request[\"month\"][0]\n", - "ds_raw = download.download_and_transform(\n", + "da_lakeid = download.download_and_transform(\n", " collection_id,\n", " single_request,\n", " chunks=chunks,\n", - ")\n", - "\n", - "da_lakeid = utils.regionalise(\n", - " ds_raw[\"lakeid\"].isel(time=0), lon_slice=lon_slice, lat_slice=lat_slice\n", - ")" + " transform_func=get_lakeid,\n", + " transform_func_kwargs={\n", + " \"lon_slice\": lon_slice,\n", + " \"lat_slice\": lat_slice,\n", + " },\n", + ")[\"lakeid\"]" ] }, { @@ -196,7 +192,7 @@ "id": "674c3e27", "metadata": {}, "source": [ - "## Plot projected map" + "## Plot projected map of lake IDs" ] }, { @@ -206,7 +202,7 @@ "metadata": {}, "outputs": [], "source": [ - "_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree())" + "_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree(), show_stats=False)" ] }, { @@ -224,32 +220,23 @@ "metadata": {}, "outputs": [], "source": [ - "da.plot()\n", - "_ = plt.title(\"Spatial weighted mean\")" - ] - }, - { - "cell_type": "markdown", - "id": "e753ff96", - "metadata": {}, - "source": [ - "## Percentage of missing values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fec2d02c", - "metadata": {}, - "outputs": [], - "source": [ - "num_missing = float(da.isnull().sum() / da.size * 100)\n", - "print(f\"Number of missing values: {num_missing:.2f} %.\")" + "for lakeid, da_lakeid in da.groupby(\"lakeid\"):\n", + " da_lakeid.dropna(\"time\").plot(label=lakeid)\n", + "plt.legend()\n", + "plt.grid()\n", + "plt.title(\"Spatial weighted mean\")\n", + "plt.show()\n", + "\n", + "# Print missing values\n", + "missings = da.isnull().sum(\"time\") / da.sizes[\"time\"] * 100\n", + "id_digits = max(map(len, da[\"lakeid\"].astype(str).values))\n", + "for lakeid, missing in missings.groupby(\"lakeid\"):\n", + " print(f\"Missing values of lake ID {lakeid:<{id_digits}}: {missing.values:.2f} %\")" ] }, { "cell_type": "markdown", - "id": "055c8c64", + "id": "c46bdda8-fdd2-44fb-afc2-f3a4af6e98b6", "metadata": {}, "source": [ "## Boxplot" @@ -258,44 +245,21 @@ { "cell_type": "code", "execution_count": null, - "id": "b1656320", + "id": "4050019d-151d-4933-91af-38265578ae7b", "metadata": {}, "outputs": [], "source": [ - "# Create a boxplot\n", - "valid_da = da.where(da.notnull().compute(), drop=True).chunk(-1)\n", - "plt.boxplot(valid_da)\n", - "\n", - "# Add title and labels\n", - "# plt.title(\"Boxplot of array with missing values\")\n", - "plt.xlabel(\"Array\")\n", - "plt.ylabel(\"lake surface skin temperature\")\n", + "df = da.to_dataframe()\n", + "df.boxplot(by=\"lakeid\")\n", + "plt.show()\n", "\n", - "# Find 1st and 3rd quantile and median\n", - "da_qiles = valid_da.quantile([0.25, 0.5, 0.75])\n", - "\n", - "# Finding the IQR region\n", - "iqr = da_qiles.sel(quantile=0.75) - da_qiles.sel(quantile=0.25)\n", - "\n", - "# Finding upper and lower whiskers\n", - "stats = {\n", - " \"median\": float(da_qiles.sel(quantile=0.5)),\n", - " \"IQR upper bound\": float(da_qiles.sel(quantile=0.75) + (1.5 * iqr)),\n", - " \"IQR lower bound\": float(da_qiles.sel(quantile=0.25) - (1.5 * iqr)),\n", - " \"minimum\": float(da.min()),\n", - " \"maximum\": float(da.max()),\n", - "}\n", - "\n", - "# Print stats\n", - "for key, value in stats.items():\n", - " print(f\"The {key} value is {value:.2f} {valid_da.units}\")\n", - "\n", - "# Check outliers\n", - "no_outliers = (\n", - " stats[\"minimum\"] >= stats[\"IQR lower bound\"]\n", - " and stats[\"maximum\"] <= stats[\"IQR upper bound\"]\n", - ")\n", - "print(f\"\\nThere are {'NO' if no_outliers else 'SOME'} outliers in the series.\")" + "# Print statistics\n", + "boxplot_stats = {}\n", + "for lakeid, df_lakeid in df.groupby(\"lakeid\"):\n", + " values = df_lakeid.dropna().values.squeeze()\n", + " (boxplot_stats[lakeid],) = matplotlib.cbook.boxplot_stats(values)\n", + "boxplot_stats = pd.DataFrame(boxplot_stats)\n", + "boxplot_stats" ] } ], @@ -315,7 +279,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.10.12" } }, "nbformat": 4,