Skip to content
This repository has been archived by the owner on Jun 2, 2023. It is now read-only.

Edits to spatial train/val/test, additional performance metrics #211

Merged
merged 24 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
de51eea
add kge for log values, and kge for upper and lower 10th of dataset
jds485 Mar 7, 2023
e30e3cc
add args to filter the data by an earliest and latest time index
jds485 Mar 8, 2023
13ee6cb
add arg to make a strict spatial partition based on the provided val …
jds485 Mar 8, 2023
e014068
removing training sites from validation data
jds485 Mar 8, 2023
f5d132f
ensuring that this spatial data filter will work if training and vali…
jds485 Mar 8, 2023
85c6917
ensure that validation sites are not in the test set
jds485 Mar 8, 2023
37fc891
add options for biweekly, monthly, and yearly timeseries summaries, a…
jds485 Mar 10, 2023
fe69f86
add metrics, edit comments
jds485 Mar 14, 2023
81dca56
add count of observations to the metrics files
jds485 Mar 14, 2023
89ec644
change groups arg to several args that describe how to group_temporal…
jds485 Mar 21, 2023
a4930cd
revert back to 10 as minimum number of observations
jds485 Mar 29, 2023
a7e35b5
change sum to mean, and change name of function arg to time_aggregation
jds485 Mar 29, 2023
c60adc5
update comment
jds485 Mar 29, 2023
5535ba4
change variable name from sum to mean
jds485 Apr 18, 2023
4ec8543
update parameter description
jds485 Apr 18, 2023
03a8ae0
check that trn, val, and tst partitions have unique time-site ids
jds485 Apr 18, 2023
1f1d6f7
check partitions with a nan filtering
jds485 Apr 18, 2023
029a933
make the handling of site selection consistent in evaluate and prepro…
jds485 Apr 18, 2023
71d4bfd
remove sites from partitions only if sites are not provided for those…
jds485 Apr 18, 2023
4875ffc
fix line indentation
jds485 Apr 18, 2023
2b59d01
allow overriding pretraining partition check
jds485 Apr 18, 2023
7d630f8
Merge branch 'main' into jds-edits
jds485 Apr 20, 2023
6c2f18d
add small offset for log metrics
jds485 Apr 26, 2023
c2e08ad
handle case when any of trn, val, tst are not used
jds485 Apr 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 220 additions & 62 deletions river_dl/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def rmse_logged(y_true, y_pred):

def nse_logged(y_true, y_pred):
"""
compute the rmse of the logged data
compute the nse of the logged data
:param y_true: [array-like] observed y_dataset values
:param y_pred: [array-like] predicted y_dataset values
:return: [float] the nse of the logged data
Expand All @@ -77,16 +77,32 @@ def nse_logged(y_true, y_pred):

def kge_eval(y_true, y_pred):
    """
    compute the Kling-Gupta Efficiency (KGE) of predictions vs. observations.

    KGE = 1 - sqrt((r-1)^2 + (std_pred/std_true - 1)^2 + (mean_pred/mean_true - 1)^2)

    :param y_true: [array-like] observed y_dataset values
    :param y_pred: [array-like] predicted y_dataset values
    :return: [float] the KGE, or np.nan when fewer than 2 paired
    observations remain after NaN filtering (correlation undefined)
    """
    # presumably drops pairs where the prediction is NaN — confirm against
    # filter_nan_preds definition
    y_true, y_pred = filter_nan_preds(y_true, y_pred)
    # Need to have > 1 observation to compute correlation.
    # This could be < 2 due to percentile filtering
    if len(y_true) > 1:
        r, _ = pearsonr(y_pred, y_true)
        mean_true = np.mean(y_true)
        mean_pred = np.mean(y_pred)
        std_true = np.std(y_true)
        std_pred = np.std(y_pred)
        # squared distance of each KGE component from its ideal value of 1
        r_component = np.square(r - 1)
        std_component = np.square((std_pred / std_true) - 1)
        bias_component = np.square((mean_pred / mean_true) - 1)
        result = 1 - np.sqrt(r_component + std_component + bias_component)
    else:
        result = np.nan
    return result

def kge_logged(y_true, y_pred):
    """
    compute the kge of the logged data
    :param y_true: [array-like] observed y_dataset values
    :param y_pred: [array-like] predicted y_dataset values
    :return: [float] the kge of the logged data
    """
    # presumably removes pairs with NaN predictions, then pairs with
    # negative predictions so np.log is defined — confirm against the
    # filter_nan_preds / filter_negative_preds definitions.
    # NOTE(review): zero-valued observations/predictions would still
    # produce -inf under np.log here — verify upstream filtering or an
    # offset handles that case.
    y_true, y_pred = filter_nan_preds(y_true, y_pred)
    y_true, y_pred = filter_negative_preds(y_true, y_pred)
    return kge_eval(np.log(y_true), np.log(y_pred))

def filter_by_percentile(y_true, y_pred, percentile, less_than=True):
"""
Expand Down Expand Up @@ -162,12 +178,11 @@ def calc_metrics(df):
),
"nse_logged": nse_logged(obs, pred),
"kge": kge_eval(obs, pred),
"rmse_logged": rmse_logged(obs, pred),
"nse_top10": percentile_metric(obs, pred, nse_eval, 90, less_than=False),
"nse_bot10": percentile_metric(obs, pred, nse_eval, 10, less_than=True),
"nse_logged": nse_logged(obs, pred),
"kge_logged": kge_logged(obs, pred),
jds485 marked this conversation as resolved.
Show resolved Hide resolved
"kge_top10": percentile_metric(obs, pred, kge_eval, 90, less_than=False),
"kge_bot10": percentile_metric(obs, pred, kge_eval, 10, less_than=True),
"n_obs": len(obs)
}

else:
metrics = {
"rmse": np.nan,
Expand All @@ -182,10 +197,10 @@ def calc_metrics(df):
"nse_bot10": np.nan,
"nse_logged": np.nan,
"kge": np.nan,
"rmse_logged": np.nan,
"nse_top10": np.nan,
"nse_bot10": np.nan,
"nse_logged": np.nan,
"kge_logged": np.nan,
"kge_top10": np.nan,
"kge_bot10": np.nan,
"n_obs": len(obs)
}
return pd.Series(metrics)

Expand All @@ -196,7 +211,10 @@ def partition_metrics(
partition,
spatial_idx_name="seg_id_nat",
time_idx_name="date",
group=None,
group_spatially=False,
group_temporally=False,
time_aggregation=False,
site_based=False,
id_dict=None,
outfile=None,
val_sites=None,
Expand All @@ -214,62 +232,180 @@ def partition_metrics(
index (e.g., 'seg_id_nat')
:param time_idx_name: [str] name of column that is used for temporal index
(usually 'time')
:param group: [str or list] which group the metrics should be computed for.
Currently only supports 'seg_id_nat' (segment-wise metrics), 'month'
(month-wise metrics), ['seg_id_nat', 'month'] (metrics broken out by segment
and month), and None (everything is left together)
:param group_spatially [bool] when True, compute segment-wise metrics.
:param group_temporally [False or str] when False, computes metrics for
the native timestep of the data. When a str, the str is passed to pd.Grouper
freq to group the data within the specified timestep
(e.g., 'W', 'M', 'Y' for week, month, and year)
:param time_aggregation [bool] Only applies when group_temporally is a string.
when time_aggregation is True, metrics are computed by first averaging data
from the native timestep to the group_temporally timestep. Only native timesteps
with observations are averaged. When False, metrics are computed for the
native timestep of the data within the group_temporally groups (e.g.,
for daily data and group_temporally = 'Y', all days with observations
in the calendar year are used to compute a metric for that year). Note that
for month, 'M', this would normally have groups by year-month. We have forced
the groups to the 12 calendar months instead.
:param site_based [bool] Only applies when group_spatially is False,
jds485 marked this conversation as resolved.
Show resolved Hide resolved
group_temporally is a string, and time_aggregation is True. When
site_based is True, the average is computed for each site to get a
group_temporally timeseries for each site. When False, the
average is computed over all sites to get a group_temporally timeseries
using data from all reaches.
:param id_dict: [dict] dictionary of id_dict where dict keys are the id
names and dict values are the id values. These are added as columns to the
metrics information
:param outfile: [str] file where the metrics should be written
jds485 marked this conversation as resolved.
Show resolved Hide resolved
:param val_sites: [list] sites to exclude from training and test metrics
:param test_sites: [list] sites to exclude from validation and training metrics
:param train_sites: [list] sites to exclude from test metrics
:param train_sites: [list] sites to exclude from validation and test metrics
:return: [pd dataframe] the condensed metrics
"""
var_data = fmt_preds_obs(preds, obs_file, spatial_idx_name,
time_idx_name)
var_metrics_list = []

for data_var, data in var_data.items():
data.reset_index(inplace=True)
# mask out validation and test sites from trn partition
if val_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(val_sites)]
if test_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(test_sites)]
# mask out test sites from val partition
if test_sites and partition=='val':
data = data[~data[spatial_idx_name].isin(test_sites)]
if train_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(train_sites)]
if val_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(val_sites)]

if not group:
if train_sites and partition == 'trn':
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that there is an omission in the original code and that the following code should be added to correct that:

if train_sites and partition=='val':
    data = data[~data[spatial_idx_name].isin(train_sites)]

If I'm understanding them correctly, I think the suggested edits change the functionality of this section. As I read the original code, it appears that train_sites (as well as test_sites and val_sites) were sites that only appeared in that partition, but they weren't necessarily the only sites in that partition. In the revised code, it appears that if train_sites is specified, it will use only those sites in the training evaluations (and remove those sites from the test and validation partition).

If the intention is to change the functionality of train_sites, etc, then it's probably good to have a broader conversation. I am not using that option in my projects currently, but I don't know if others are.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I read the original code, it appears that train_sites (as well as test_sites and val_sites) were sites that only appeared in that partition, but they weren't necessarily the only sites in that partition.

Yes, I agree. I wasn't sure if this was intentional. I could add another explicit_spatial_split parameter here to allow for the previous functionality when False and this new functionality when True. I'll hold off on that before receiving feedback from others

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh wow. Yeah. This was definitely a bug! 😮 Thanks for catching this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@janetrbarclay: in a comment below, Jeff suggests we assume that sites within the train/val/test are the only sites in those partitions. That's also what I would expect. Do you know of anyone who is relying on the previous method wherein sites that are not within train_sites/val_sites/test_sites could be in the train/val/test partitions?

# simply use the train sites when specified.
data = data.loc[data.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
else:
#check if validation or testing sites are specified
if val_sites and partition == 'trn':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]
if test_sites and partition == 'trn':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
# mask out training and test sites from val partition
if val_sites and partition == 'val':
data = data.loc[data.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]
else:
if test_sites and partition=='val':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
if train_sites and partition=='val':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
# mask out training and validation sites from val partition
if test_sites and partition == 'tst':
data = data.loc[data.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
else:
if train_sites and partition=='tst':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
if val_sites and partition=='tst':
data = data.loc[~data.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]

if not group_spatially and not group_temporally:
metrics = calc_metrics(data)
# need to convert to dataframe and transpose so it looks like the
# others
metrics = pd.DataFrame(metrics).T
elif group == "seg_id_nat":
metrics = data.groupby(spatial_idx_name).apply(calc_metrics).reset_index()
elif group == "month":
metrics = (
data.groupby(
data[time_idx_name].dt.month)
.apply(calc_metrics)
.reset_index()
)
elif group == ["seg_id_nat", "month"]:
metrics = (
data.groupby(
[data[time_idx_name].dt.month,
spatial_idx_name])
elif group_spatially and not group_temporally:
#note: same as data.groupby(level=spatial_idx_name)
metrics = (data.groupby(pd.Grouper(level=spatial_idx_name))
.apply(calc_metrics)
.reset_index()
)
else:
raise ValueError("group value not valid")
elif not group_spatially and group_temporally:
if time_aggregation:
#performance metrics computed at the group_temporally timestep
#for some reason, no `.` calculations are allowed after .mean(),
#so calc_metrics() is called first.
if site_based:
#create a group_temporally timeseries for each observation site
metrics = calc_metrics(data
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the month.
.dropna()
.groupby([pd.Grouper(level=time_idx_name, freq=group_temporally),
pd.Grouper(level=spatial_idx_name)])
.mean()
)
else:
#create a group_temporally timeseries using data from all reaches
data_sum = (data
.dropna()
.groupby(pd.Grouper(level=time_idx_name, freq=group_temporally))
.mean()
)
#For some reason, with pd.Grouper the sum is computed as 0
# on days with no observations. Need to remove these days
# before calculating metrics. Get the indicies with 0 obs:
data_count_0 = np.where(data
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the month.
.dropna()
.groupby(pd.Grouper(level=time_idx_name, freq=group_temporally))
.count()
.reset_index()
.obs == 0
)[0]
if len(data_count_0) > 0:
data_sum = data_sum.drop(index=data_sum.index[data_count_0])
metrics = calc_metrics(data_sum)
metrics = pd.DataFrame(metrics).T
else:
if group_temporally != 'M':
#native timestep performance metrics within the group_temporally groups
#This method will report one row per group_temporally group
# examples: year-month-week would be a group when group_temporally is 'W'
# year would be a group when group_temporally is 'Y'
metrics = (data
.groupby(pd.Grouper(level=time_idx_name, freq=group_temporally))
.apply(calc_metrics)
.reset_index()
)
else:
#This method reports one row per calendar month (1-12)
metrics = (data.reset_index()
.groupby(data.reset_index()[time_idx_name].dt.month)
.apply(calc_metrics)
.reset_index()
)
elif group_spatially and group_temporally:
if time_aggregation:
#performance metrics for each reach computed at the group_temporally timestep
data_calc = (data
.dropna()
.groupby([pd.Grouper(level=time_idx_name, freq=group_temporally),
pd.Grouper(level=spatial_idx_name)])
.mean()
)
#unable to apply any other . functions after .mean().
metrics = (data_calc.groupby(pd.Grouper(level=spatial_idx_name))
.apply(calc_metrics)
.reset_index()
)
else:
if group_temporally != 'M':
metrics = (data
.groupby([pd.Grouper(level=time_idx_name, freq=group_temporally),
pd.Grouper(level=spatial_idx_name)])
.apply(calc_metrics)
.reset_index()
)
else:
metrics = (data.reset_index()
.groupby([data.reset_index()[time_idx_name].dt.month, spatial_idx_name])
.apply(calc_metrics)
.reset_index()
)

metrics["variable"] = data_var
metrics["partition"] = partition
Expand All @@ -294,7 +430,10 @@ def combined_metrics(
train_sites=None,
spatial_idx_name="seg_id_nat",
time_idx_name="date",
group=None,
group_spatially=False,
group_temporally=False,
time_aggregation=False,
site_based=False,
id_dict=None,
outfile=None,
):
Expand All @@ -318,10 +457,26 @@ def combined_metrics(
index (e.g., 'seg_id_nat')
:param time_idx_name: [str] name of column that is used for temporal index
(usually 'time')
:param group: [str or list] which group the metrics should be computed for.
Currently only supports 'seg_id_nat' (segment-wise metrics), 'month'
(month-wise metrics), ['seg_id_nat', 'month'] (metrics broken out by segment
and month), and None (everything is left together)
:param group_spatially [bool] when True, compute segment-wise metrics.
:param group_temporally [False or str] when False, computes metrics for
the native timestep of the data. When a str, the str is passed to pd.Grouper
freq to group the data within the specified timestep
(e.g., 'W', 'M', 'Y' for week, month, and year)
:param time_aggregation [bool] Only applies when group_temporally is a string.
when time_aggregation is True, metrics are computed by first averaging data
from the native timestep to the group_temporally timestep. Only native timesteps
with observations are averaged. When False, metrics are computed for the
native timestep of the data within the group_temporally groups (e.g.,
for daily data and group_temporally = 'Y', all days with observations
in the calendar year are used to compute a metric for that year). Note that
for month, 'M', this would normally have groups by year-month. We have forced
the groups to the 12 calendar months instead.
:param site_based [bool] Only applies when group_spatially is False,
group_temporally is a string, and time_aggregation is True. When
site_based is True, the average is computed for each site to get a
group_temporally timeseries for each site. When False, the
average is computed over all sites to get a group_temporally timeseries
using data from all reaches.
:param id_dict: [dict] dictionary of id_dict where dict keys are the id
names and dict values are the id values. These are added as columns to the
metrics information
Expand Down Expand Up @@ -353,10 +508,13 @@ def combined_metrics(
spatial_idx_name=spatial_idx_name,
time_idx_name=time_idx_name,
id_dict=id_dict,
group=group,
group_spatially=group_spatially,
group_temporally=group_temporally,
time_aggregation=time_aggregation,
site_based=site_based,
val_sites = val_sites,
test_sites = test_sites,
train_sites=train_sites)
train_sites = train_sites)
df_all.extend([metrics])

df_all = pd.concat(df_all, axis=0)
Expand Down
Loading