Skip to content
This repository has been archived by the owner on Jun 2, 2023. It is now read-only.

Edits to spatial train/val/test, additional performance metrics #211

Merged
merged 24 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
de51eea
add kge for log values, and kge for upper and lower 10th of dataset
jds485 Mar 7, 2023
e30e3cc
add args to filter the data by an earliest and latest time index
jds485 Mar 8, 2023
13ee6cb
add arg to make a strict spatial partition based on the provided val …
jds485 Mar 8, 2023
e014068
removing training sites from validation data
jds485 Mar 8, 2023
f5d132f
ensuring that this spatial data filter will work if training and vali…
jds485 Mar 8, 2023
85c6917
ensure that validation sites are not in the test set
jds485 Mar 8, 2023
37fc891
add options for biweekly, monthly, and yearly timeseries summaries, a…
jds485 Mar 10, 2023
fe69f86
add metrics, edit comments
jds485 Mar 14, 2023
81dca56
add count of observations to the metrics files
jds485 Mar 14, 2023
89ec644
change groups arg to several args that describe how to group_temporal…
jds485 Mar 21, 2023
a4930cd
revert back to 10 as minimum number of observations
jds485 Mar 29, 2023
a7e35b5
change sum to mean, and change name of function arg to time_aggregation
jds485 Mar 29, 2023
c60adc5
update comment
jds485 Mar 29, 2023
5535ba4
change variable name from sum to mean
jds485 Apr 18, 2023
4ec8543
update parameter description
jds485 Apr 18, 2023
03a8ae0
check that trn, val, and tst partitions have unique time-site ids
jds485 Apr 18, 2023
1f1d6f7
check partitions with a nan filtering
jds485 Apr 18, 2023
029a933
make the handling of site selection consistent in evaluate and prepro…
jds485 Apr 18, 2023
71d4bfd
remove sites from partitions only if sites are not provided for those…
jds485 Apr 18, 2023
4875ffc
fix line indentation
jds485 Apr 18, 2023
2b59d01
allow overriding pretraining partition check
jds485 Apr 18, 2023
7d630f8
Merge branch 'main' into jds-edits
jds485 Apr 20, 2023
6c2f18d
add small offset for log metrics
jds485 Apr 26, 2023
c2e08ad
handle case when any of trn, val, tst are not used
jds485 Apr 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 220 additions & 62 deletions river_dl/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def rmse_logged(y_true, y_pred):

def nse_logged(y_true, y_pred):
"""
compute the rmse of the logged data
compute the nse of the logged data
:param y_true: [array-like] observed y_dataset values
:param y_pred: [array-like] predicted y_dataset values
:return: [float] the nse of the logged data
Expand All @@ -77,16 +77,32 @@ def nse_logged(y_true, y_pred):

def kge_eval(y_true, y_pred):
    """
    compute the Kling-Gupta Efficiency (KGE) of predictions vs. observations.

    KGE = 1 - sqrt((r-1)^2 + (std_pred/std_true - 1)^2 + (mean_pred/mean_true - 1)^2)

    :param y_true: [array-like] observed y_dataset values
    :param y_pred: [array-like] predicted y_dataset values
    :return: [float] the KGE, or np.nan when fewer than 2 paired
    observations remain after NaN filtering (correlation undefined)
    """
    # presumably drops pairs where the prediction is NaN — confirm against
    # filter_nan_preds definition
    y_true, y_pred = filter_nan_preds(y_true, y_pred)
    # Need to have > 1 observation to compute correlation.
    # This could be < 2 due to percentile filtering
    if len(y_true) > 1:
        r, _ = pearsonr(y_pred, y_true)
        mean_true = np.mean(y_true)
        mean_pred = np.mean(y_pred)
        std_true = np.std(y_true)
        std_pred = np.std(y_pred)
        # squared distance of each KGE component from its ideal value of 1
        r_component = np.square(r - 1)
        std_component = np.square((std_pred / std_true) - 1)
        bias_component = np.square((mean_pred / mean_true) - 1)
        result = 1 - np.sqrt(r_component + std_component + bias_component)
    else:
        result = np.nan
    return result

def kge_logged(y_true, y_pred):
    """
    compute the kge of the logged data
    :param y_true: [array-like] observed y_dataset values
    :param y_pred: [array-like] predicted y_dataset values
    :return: [float] the kge of the logged data
    """
    # presumably removes pairs with NaN predictions, then pairs with
    # negative predictions so np.log is defined — confirm against the
    # filter_nan_preds / filter_negative_preds definitions.
    # NOTE(review): zero-valued observations/predictions would still
    # produce -inf under np.log here — verify upstream filtering or an
    # offset handles that case.
    y_true, y_pred = filter_nan_preds(y_true, y_pred)
    y_true, y_pred = filter_negative_preds(y_true, y_pred)
    return kge_eval(np.log(y_true), np.log(y_pred))

def filter_by_percentile(y_true, y_pred, percentile, less_than=True):
"""
Expand Down Expand Up @@ -162,12 +178,11 @@ def calc_metrics(df):
),
"nse_logged": nse_logged(obs, pred),
"kge": kge_eval(obs, pred),
"rmse_logged": rmse_logged(obs, pred),
"nse_top10": percentile_metric(obs, pred, nse_eval, 90, less_than=False),
"nse_bot10": percentile_metric(obs, pred, nse_eval, 10, less_than=True),
"nse_logged": nse_logged(obs, pred),
"kge_logged": kge_logged(obs, pred),
jds485 marked this conversation as resolved.
Show resolved Hide resolved
"kge_top10": percentile_metric(obs, pred, kge_eval, 90, less_than=False),
"kge_bot10": percentile_metric(obs, pred, kge_eval, 10, less_than=True),
"n_obs": len(obs)
}

else:
metrics = {
"rmse": np.nan,
Expand All @@ -182,10 +197,10 @@ def calc_metrics(df):
"nse_bot10": np.nan,
"nse_logged": np.nan,
"kge": np.nan,
"rmse_logged": np.nan,
"nse_top10": np.nan,
"nse_bot10": np.nan,
"nse_logged": np.nan,
"kge_logged": np.nan,
"kge_top10": np.nan,
"kge_bot10": np.nan,
"n_obs": len(obs)
}
return pd.Series(metrics)

Expand All @@ -196,7 +211,10 @@ def partition_metrics(
partition,
spatial_idx_name="seg_id_nat",
time_idx_name="date",
group=None,
group_spatially=False,
group_temporally=False,
time_aggregation=False,
site_based=False,
id_dict=None,
outfile=None,
val_sites=None,
Expand All @@ -214,62 +232,180 @@ def partition_metrics(
index (e.g., 'seg_id_nat')
:param time_idx_name: [str] name of column that is used for temporal index
(usually 'time')
:param group: [str or list] which group the metrics should be computed for.
Currently only supports 'seg_id_nat' (segment-wise metrics), 'month'
(month-wise metrics), ['seg_id_nat', 'month'] (metrics broken out by segment
and month), and None (everything is left together)
:param group_spatially [bool] when True, compute segment-wise metrics.
:param group_temporally [False or str] when False, computes metrics for
the native timestep of the data. When a str, the str is passed to pd.Grouper
freq to group the data within the specified timestep
(e.g., 'W', 'M', 'Y' for week, month, and year)
:param time_aggregation [bool] Only applies when group_temporally is a string.
when time_aggregation is True, metrics are computed by first averaging data
from the native timestep to the group_temporally timestep. Only native timesteps
with observations are averaged. When False, metrics are computed for the
native timestep of the data within the group_temporally groups (e.g.,
for daily data and group_temporally = 'Y', all days with observations
in the calendar year are used to compute a metric for that year). Note that
for month, 'M', this would normally have groups by year-month. We have forced
the groups to the 12 calendar months instead.
:param site_based [bool] Only applies when group_spatially is False,
jds485 marked this conversation as resolved.
Show resolved Hide resolved
group_temporally is a string, and time_aggregation is True. When
site_based is True, the average is computed for each site to get a
group_temporally timeseries for each site. When False, the
average is computed over all sites to get a group_temporally timeseries
using data from all reaches.
:param id_dict: [dict] dictionary of id_dict where dict keys are the id
names and dict values are the id values. These are added as columns to the
metrics information
:param outfile: [str] file where the metrics should be written
jds485 marked this conversation as resolved.
Show resolved Hide resolved
:param val_sites: [list] sites to exclude from training and test metrics
:param test_sites: [list] sites to exclude from validation and training metrics
:param train_sites: [list] sites to exclude from test metrics
:param train_sites: [list] sites to exclude from validation and test metrics
:return: [pd dataframe] the condensed metrics
"""
var_data = fmt_preds_obs(preds, obs_file, spatial_idx_name,
time_idx_name)
var_metrics_list = []

for data_var, data in var_data.items():
data.reset_index(inplace=True)
# mask out validation and test sites from trn partition
if val_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(val_sites)]
if test_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(test_sites)]
# mask out test sites from val partition
if test_sites and partition=='val':
data = data[~data[spatial_idx_name].isin(test_sites)]
if train_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(train_sites)]
if val_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(val_sites)]

if not group:
if train_sites and partition == 'trn':
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that there is an omission in the original code and that the following code should be added to correct that:

if train_sites and partition=='val':
    data = data[~data[spatial_idx_name].isin(train_sites)]

If I'm understanding them correctly, I think the suggested edits change the functionality of this section. As I read the original code, it appears that train_sites (as well as test_sites and val_sites) were sites that only appeared in that partition, but they weren't necessarily the only sites in that partition. In the revised code, it appears that if train_sites is specified, it will use only those sites in the training evaluations (and remove those sites from the test and validation partition).

If the intention is to change the functionality of train_sites, etc, then it's probably good to have a broader conversation. I am not using that option in my projects currently, but I don't know if others are.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I read the original code, it appears that train_sites (as well as test_sites and val_sites) were sites that only appeared in that partition, but they weren't necessarily the only sites in that partition.

Yes, I agree. I wasn't sure if this was intentional. I could add another explicit_spatial_split parameter here to allow for the previous functionality when False and this new functionality when True. I'll hold off on that before receiving feedback from others

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh wow. Yeah. This was definitely a bug! 😮 Thanks for catching this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@janetrbarclay: in a comment below, Jeff suggests we assume that sites within the train/val/test are the only sites in those partitions. That's also what I would expect. Do you know of anyone who is relying on the previous method wherein sites that are not within train_sites/val_sites/test_sites could be in the train/val/test partitions?

# simply use the train sites when specified.
data = data.loc[data.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
else:
#check if validation or testing sites are specified
if val_sites and partition == 'trn':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]
if test_sites and partition == 'trn':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
# mask out training and test sites from val partition
if val_sites and partition == 'val':
data = data.loc[data.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]
else:
if test_sites and partition=='val':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
if train_sites and partition=='val':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
# mask out training and validation sites from val partition
if test_sites and partition == 'tst':
data = data.loc[data.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
else:
if train_sites and partition=='tst':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
if val_sites and partition=='tst':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]

if not group_spatially and not group_temporally:
metrics = calc_metrics(data)
# need to convert to dataframe and transpose so it looks like the
# others
metrics = pd.DataFrame(metrics).T
elif group == "seg_id_nat":
metrics = data.groupby(spatial_idx_name).apply(calc_metrics).reset_index()
elif group == "month":
metrics = (
data.groupby(
data[time_idx_name].dt.month)
.apply(calc_metrics)
.reset_index()
)
elif group == ["seg_id_nat", "month"]:
metrics = (
data.groupby(
[data[time_idx_name].dt.month,
spatial_idx_name])
elif group_spatially and not group_temporally:
#note: same as data.groupby(level=spatial_idx_name)
metrics = (data.groupby(pd.Grouper(level=spatial_idx_name))
.apply(calc_metrics)
.reset_index()
)
else:
raise ValueError("group value not valid")
elif not group_spatially and group_temporally:
if time_aggregation:
#performance metrics computed at the group_temporally timestep
#for some reason, no `.` calculations are allowed after .mean(),
#so calc_metrics() is called first.
if site_based:
#create a group_temporally timeseries for each observation site
metrics = calc_metrics(data
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the month.
.dropna()
.groupby([pd.Grouper(level=time_idx_name, freq=group_temporally),
pd.Grouper(level=spatial_idx_name)])
.mean()
)
else:
#create a group_temporally timeseries using data from all reaches
data_sum = (data
.dropna()
.groupby(pd.Grouper(level=time_idx_name, freq=group_temporally))
.mean()
)
#For some reason, with pd.Grouper the sum is computed as 0
# on days with no observations. Need to remove these days
# before calculating metrics. Get the indicies with 0 obs:
data_count_0 = np.where(data
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the month.
.dropna()
.groupby(pd.Grouper(level=time_idx_name, freq=group_temporally))
.count()
.reset_index()
.obs == 0
)[0]
if len(data_count_0) > 0:
data_sum = data_sum.drop(index=data_sum.index[data_count_0])
metrics = calc_metrics(data_sum)
metrics = pd.DataFrame(metrics).T
else:
if group_temporally != 'M':
#native timestep performance metrics within the group_temporally groups
#This method will report one row per group_temporally group
# examples: year-month-week would be a group when group_temporally is 'W'
# year would be a group when group_temporally is 'Y'
metrics = (data
.groupby(pd.Grouper(level=time_idx_name, freq=group_temporally))
.apply(calc_metrics)
.reset_index()
)
else:
#This method reports one row per calendar month (1-12)
metrics = (data.reset_index()
.groupby(data.reset_index()[time_idx_name].dt.month)
.apply(calc_metrics)
.reset_index()
)
elif group_spatially and group_temporally:
if time_aggregation:
#performance metrics for each reach computed at the group_temporally timestep
data_calc = (data
.dropna()
.groupby([pd.Grouper(level=time_idx_name, freq=group_temporally),
pd.Grouper(level=spatial_idx_name)])
.mean()
)
#unable to apply any other . functions after .mean().
metrics = (data_calc.groupby(pd.Grouper(level=spatial_idx_name))
.apply(calc_metrics)
.reset_index()
)
else:
if group_temporally != 'M':
metrics = (data
.groupby([pd.Grouper(level=time_idx_name, freq=group_temporally),
pd.Grouper(level=spatial_idx_name)])
.apply(calc_metrics)
.reset_index()
)
else:
metrics = (data.reset_index()
.groupby([data.reset_index()[time_idx_name].dt.month, spatial_idx_name])
.apply(calc_metrics)
.reset_index()
)

metrics["variable"] = data_var
metrics["partition"] = partition
Expand All @@ -294,7 +430,10 @@ def combined_metrics(
train_sites=None,
spatial_idx_name="seg_id_nat",
time_idx_name="date",
group=None,
group_spatially=False,
group_temporally=False,
time_aggregation=False,
site_based=False,
id_dict=None,
outfile=None,
):
Expand All @@ -318,10 +457,26 @@ def combined_metrics(
index (e.g., 'seg_id_nat')
:param time_idx_name: [str] name of column that is used for temporal index
(usually 'time')
:param group: [str or list] which group the metrics should be computed for.
Currently only supports 'seg_id_nat' (segment-wise metrics), 'month'
(month-wise metrics), ['seg_id_nat', 'month'] (metrics broken out by segment
and month), and None (everything is left together)
:param group_spatially [bool] when True, compute segment-wise metrics.
:param group_temporally [False or str] when False, computes metrics for
the native timestep of the data. When a str, the str is passed to pd.Grouper
freq to group the data within the specified timestep
(e.g., 'W', 'M', 'Y' for week, month, and year)
:param time_aggregation [bool] Only applies when group_temporally is a string.
when time_aggregation is True, metrics are computed by first averaging data
from the native timestep to the group_temporally timestep. Only native timesteps
with observations are averaged. When False, metrics are computed for the
native timestep of the data within the group_temporally groups (e.g.,
for daily data and group_temporally = 'Y', all days with observations
in the calendar year are used to compute a metric for that year). Note that
for month, 'M', this would normally have groups by year-month. We have forced
the groups to the 12 calendar months instead.
:param site_based [bool] Only applies when group_spatially is False,
group_temporally is a string, and time_aggregation is True. When
site_based is True, the average is computed for each site to get a
group_temporally timeseries for each site. When False, the
average is computed over all sites to get a group_temporally timeseries
using data from all reaches.
:param id_dict: [dict] dictionary of id_dict where dict keys are the id
names and dict values are the id values. These are added as columns to the
metrics information
Expand Down Expand Up @@ -353,10 +508,13 @@ def combined_metrics(
spatial_idx_name=spatial_idx_name,
time_idx_name=time_idx_name,
id_dict=id_dict,
group=group,
group_spatially=group_spatially,
group_temporally=group_temporally,
time_aggregation=time_aggregation,
site_based=site_based,
val_sites = val_sites,
test_sites = test_sites,
train_sites=train_sites)
train_sites = train_sites)
df_all.extend([metrics])

df_all = pd.concat(df_all, axis=0)
Expand Down
Loading