review lake
malmans2 committed Jul 20, 2023
1 parent eeb8883 commit a7ec6f3
Showing 1 changed file with 63 additions and 99 deletions.
notebooks/wp5/lake_water_temperature_outlier_detection.ipynb — 162 changes: 63 additions & 99 deletions
@@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "48a773cc",
"metadata": {},
"source": [
"# Statistical analysis"
]
},
{
"cell_type": "markdown",
"id": "e2cc23fd",
@@ -16,25 +8,6 @@
"# Completeness of data series and outliers detection"
]
},
{
"cell_type": "markdown",
"id": "098c163c",
"metadata": {},
"source": [
"Use Case: Check completeness of lake water temperature time series for Great African Lakes and outliers detection.\n",
"\n",
"User Question: The satellite lakes water temperature dataset for Great African Lakes is complete in time? Are there some outliers?\n",
"\n",
"Methods:\n",
"\n",
"```\n",
" - Select Great African Lakes area and extract the mean water lakes temperature\n",
" - Plot the time series\n",
" - Calculate percentage of missing values\n",
" - Boxplot of the values and outliers detection\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "fa78226f",
@@ -51,7 +24,10 @@
"outputs": [],
"source": [
"import cartopy.crs as ccrs\n",
"import matplotlib.cbook\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import xarray as xr\n",
"from c3s_eqc_automatic_quality_control import diagnostics, download, plot, utils\n",
"\n",
"plt.style.use(\"seaborn-v0_8-notebook\")"
@@ -122,11 +98,23 @@
"metadata": {},
"outputs": [],
"source": [
"def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname):\n",
" ds = ds[[varname]]\n",
"def spatial_weighted_mean_of_region(ds, lon_slice, lat_slice, varname, lakeids):\n",
" ds = ds[[varname, \"lakeid\"]]\n",
" ds = ds.chunk({\"time\": 1, \"latitude\": 1_200, \"longitude\": 2_400})\n",
" ds = utils.regionalise(ds, lon_slice=lon_slice, lat_slice=lat_slice)\n",
" ds = diagnostics.spatial_weighted_mean(ds)\n",
" return ds"
" dataarrays = []\n",
" for lakeid in lakeids:\n",
" da = ds[varname].where(ds[\"lakeid\"] == lakeid)\n",
" da = diagnostics.spatial_weighted_mean(da)\n",
" dataarrays.append(da.expand_dims(lakeid=[lakeid]))\n",
" return xr.concat(dataarrays, \"lakeid\").to_dataset()\n",
"\n",
"\n",
"def get_lakeid(ds, lon_slice, lat_slice):\n",
" da = ds[\"lakeid\"].isel(time=0)\n",
" da = da.chunk({\"latitude\": 1_200, \"longitude\": 2_400})\n",
" da = utils.regionalise(da, lon_slice=lon_slice, lat_slice=lat_slice)\n",
" return da.to_dataset()"
]
},
{
@@ -157,17 +145,24 @@
" \"lon_slice\": lon_slice,\n",
" \"lat_slice\": lat_slice,\n",
" \"varname\": varname,\n",
" \"lakeids\": [3, 7, 10],\n",
" },\n",
")\n",
"da = ds[varname]"
"da = ds[varname].compute()"
]
},
{
"cell_type": "markdown",
"id": "328f352f",
"metadata": {},
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Extract lake id to plot a map of the region"
"## Extract lake IDs to plot a map of the region"
]
},
{
@@ -180,23 +175,24 @@
"# We use one of the request previously cached\n",
"single_request = requests[0]\n",
"single_request[\"month\"] = single_request[\"month\"][0]\n",
"ds_raw = download.download_and_transform(\n",
"da_lakeid = download.download_and_transform(\n",
" collection_id,\n",
" single_request,\n",
" chunks=chunks,\n",
")\n",
"\n",
"da_lakeid = utils.regionalise(\n",
" ds_raw[\"lakeid\"].isel(time=0), lon_slice=lon_slice, lat_slice=lat_slice\n",
")"
" transform_func=get_lakeid,\n",
" transform_func_kwargs={\n",
" \"lon_slice\": lon_slice,\n",
" \"lat_slice\": lat_slice,\n",
" },\n",
")[\"lakeid\"]"
]
},
{
"cell_type": "markdown",
"id": "674c3e27",
"metadata": {},
"source": [
"## Plot projected map"
"## Plot projected map of lake IDs"
]
},
{
@@ -206,7 +202,7 @@
"metadata": {},
"outputs": [],
"source": [
"_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree())"
"_ = plot.projected_map(da_lakeid, projection=ccrs.PlateCarree(), show_stats=False)"
]
},
{
@@ -224,32 +220,23 @@
"metadata": {},
"outputs": [],
"source": [
"da.plot()\n",
"_ = plt.title(\"Spatial weighted mean\")"
]
},
{
"cell_type": "markdown",
"id": "e753ff96",
"metadata": {},
"source": [
"## Percentage of missing values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fec2d02c",
"metadata": {},
"outputs": [],
"source": [
"num_missing = float(da.isnull().sum() / da.size * 100)\n",
"print(f\"Number of missing values: {num_missing:.2f} %.\")"
"for lakeid, da_lakeid in da.groupby(\"lakeid\"):\n",
" da_lakeid.dropna(\"time\").plot(label=lakeid)\n",
"plt.legend()\n",
"plt.grid()\n",
"plt.title(\"Spatial weighted mean\")\n",
"plt.show()\n",
"\n",
"# Print missing values\n",
"missings = da.isnull().sum(\"time\") / da.sizes[\"time\"] * 100\n",
"id_digits = max(map(len, da[\"lakeid\"].astype(str).values))\n",
"for lakeid, missing in missings.groupby(\"lakeid\"):\n",
" print(f\"Missing values of lake ID {lakeid:<{id_digits}}: {missing.values:.2f} %\")"
]
},
{
"cell_type": "markdown",
"id": "055c8c64",
"id": "c46bdda8-fdd2-44fb-afc2-f3a4af6e98b6",
"metadata": {},
"source": [
"## Boxplot"
@@ -258,44 +245,21 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b1656320",
"id": "4050019d-151d-4933-91af-38265578ae7b",
"metadata": {},
"outputs": [],
"source": [
"# Create a boxplot\n",
"valid_da = da.where(da.notnull().compute(), drop=True).chunk(-1)\n",
"plt.boxplot(valid_da)\n",
"\n",
"# Add title and labels\n",
"# plt.title(\"Boxplot of array with missing values\")\n",
"plt.xlabel(\"Array\")\n",
"plt.ylabel(\"lake surface skin temperature\")\n",
"df = da.to_dataframe()\n",
"df.boxplot(by=\"lakeid\")\n",
"plt.show()\n",
"\n",
"# Find 1st and 3rd quantile and median\n",
"da_qiles = valid_da.quantile([0.25, 0.5, 0.75])\n",
"\n",
"# Finding the IQR region\n",
"iqr = da_qiles.sel(quantile=0.75) - da_qiles.sel(quantile=0.25)\n",
"\n",
"# Finding upper and lower whiskers\n",
"stats = {\n",
" \"median\": float(da_qiles.sel(quantile=0.5)),\n",
" \"IQR upper bound\": float(da_qiles.sel(quantile=0.75) + (1.5 * iqr)),\n",
" \"IQR lower bound\": float(da_qiles.sel(quantile=0.25) - (1.5 * iqr)),\n",
" \"minimum\": float(da.min()),\n",
" \"maximum\": float(da.max()),\n",
"}\n",
"\n",
"# Print stats\n",
"for key, value in stats.items():\n",
" print(f\"The {key} value is {value:.2f} {valid_da.units}\")\n",
"\n",
"# Check outliers\n",
"no_outliers = (\n",
" stats[\"minimum\"] >= stats[\"IQR lower bound\"]\n",
" and stats[\"maximum\"] <= stats[\"IQR upper bound\"]\n",
")\n",
"print(f\"\\nThere are {'NO' if no_outliers else 'SOME'} outliers in the series.\")"
"# Print statistics\n",
"boxplot_stats = {}\n",
"for lakeid, df_lakeid in df.groupby(\"lakeid\"):\n",
" values = df_lakeid.dropna().values.squeeze()\n",
" (boxplot_stats[lakeid],) = matplotlib.cbook.boxplot_stats(values)\n",
"boxplot_stats = pd.DataFrame(boxplot_stats)\n",
"boxplot_stats"
]
}
],
@@ -315,7 +279,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
"version": "3.10.12"
}
},
"nbformat": 4,
