This repository has been archived by the owner on Jun 2, 2023. It is now read-only.

Edits to spatial train/val/test, additional performance metrics #211

Merged
merged 24 commits on Apr 26, 2023
Changes from 9 commits
Commits (24)
de51eea
add kge for log values, and kge for upper and lower 10th of dataset
jds485 Mar 7, 2023
e30e3cc
add args to filter the data by an earliest and latest time index
jds485 Mar 8, 2023
13ee6cb
add arg to make a strict spatial partition based on the provided val …
jds485 Mar 8, 2023
e014068
removing training sites from validation data
jds485 Mar 8, 2023
f5d132f
ensuring that this spatial data filter will work if training and vali…
jds485 Mar 8, 2023
85c6917
ensure that validation sites are not in the test set
jds485 Mar 8, 2023
37fc891
add options for biweekly, monthly, and yearly timeseries summaries, a…
jds485 Mar 10, 2023
fe69f86
add metrics, edit comments
jds485 Mar 14, 2023
81dca56
add count of observations to the metrics files
jds485 Mar 14, 2023
89ec644
change groups arg to several args that describe how to group_temporal…
jds485 Mar 21, 2023
a4930cd
revert back to 10 as minimum number of observations
jds485 Mar 29, 2023
a7e35b5
change sum to mean, and change name of function arg to time_aggregati…
jds485 Mar 29, 2023
c60adc5
update comment
jds485 Mar 29, 2023
5535ba4
change variable name from sum to mean
jds485 Apr 18, 2023
4ec8543
update parameter description
jds485 Apr 18, 2023
03a8ae0
check that trn, val, and tst partitions have unique time-site ids
jds485 Apr 18, 2023
1f1d6f7
check partitions with a nan filtering
jds485 Apr 18, 2023
029a933
make the handling of site selection consistent in evaluate and prepro…
jds485 Apr 18, 2023
71d4bfd
remove sites from partitions only if sites are not provided for those…
jds485 Apr 18, 2023
4875ffc
fix line indentation
jds485 Apr 18, 2023
2b59d01
allow overriding pretraining partition check
jds485 Apr 18, 2023
7d630f8
Merge branch 'main' into jds-edits
jds485 Apr 20, 2023
6c2f18d
add small offset for log metrics
jds485 Apr 26, 2023
c2e08ad
handle case when any of trn, val, tst are not used
jds485 Apr 26, 2023
238 changes: 204 additions & 34 deletions river_dl/evaluate.py
@@ -65,7 +65,7 @@ def rmse_logged(y_true, y_pred):

def nse_logged(y_true, y_pred):
"""
compute the rmse of the logged data
compute the nse of the logged data
:param y_true: [array-like] observed y_dataset values
:param y_pred: [array-like] predicted y_dataset values
:return: [float] the nse of the logged data
@@ -77,16 +77,32 @@ def nse_logged(y_true, y_pred):

def kge_eval(y_true, y_pred):
y_true, y_pred = filter_nan_preds(y_true, y_pred)
r, _ = pearsonr(y_pred, y_true)
mean_true = np.mean(y_true)
mean_pred = np.mean(y_pred)
std_true = np.std(y_true)
std_pred = np.std(y_pred)
r_component = np.square(r - 1)
std_component = np.square((std_pred / std_true) - 1)
bias_component = np.square((mean_pred / mean_true) - 1)
return 1 - np.sqrt(r_component + std_component + bias_component)
#Need to have > 1 observation to compute correlation.
#This could be < 2 due to percentile filtering
if len(y_true) > 1:
r, _ = pearsonr(y_pred, y_true)
mean_true = np.mean(y_true)
mean_pred = np.mean(y_pred)
std_true = np.std(y_true)
std_pred = np.std(y_pred)
r_component = np.square(r - 1)
std_component = np.square((std_pred / std_true) - 1)
bias_component = np.square((mean_pred / mean_true) - 1)
result = 1 - np.sqrt(r_component + std_component + bias_component)
else:
result = np.nan
return result

def kge_logged(y_true, y_pred):
"""
compute the kge of the logged data
:param y_true: [array-like] observed y_dataset values
:param y_pred: [array-like] predicted y_dataset values
:return: [float] the kge of the logged data
"""
y_true, y_pred = filter_nan_preds(y_true, y_pred)
y_true, y_pred = filter_negative_preds(y_true, y_pred)
return kge_eval(np.log(y_true), np.log(y_pred))

def filter_by_percentile(y_true, y_pred, percentile, less_than=True):
"""
@@ -136,7 +152,7 @@ def calc_metrics(df):
pred = df["pred"].values
obs, pred = filter_nan_preds(obs, pred)

if len(obs) > 10:
if len(obs) > 20:
metrics = {
"rmse": rmse_eval(obs, pred),
"nse": nse_eval(obs, pred),
@@ -162,12 +178,11 @@ def calc_metrics(df):
),
"nse_logged": nse_logged(obs, pred),
"kge": kge_eval(obs, pred),
"rmse_logged": rmse_logged(obs, pred),
"nse_top10": percentile_metric(obs, pred, nse_eval, 90, less_than=False),
"nse_bot10": percentile_metric(obs, pred, nse_eval, 10, less_than=True),
"nse_logged": nse_logged(obs, pred),
"kge_logged": kge_logged(obs, pred),
"kge_top10": percentile_metric(obs, pred, kge_eval, 90, less_than=False),
"kge_bot10": percentile_metric(obs, pred, kge_eval, 10, less_than=True),
"n_obs": len(obs)
}

else:
metrics = {
"rmse": np.nan,
@@ -182,10 +197,10 @@ def calc_metrics(df):
"nse_bot10": np.nan,
"nse_logged": np.nan,
"kge": np.nan,
"rmse_logged": np.nan,
"nse_top10": np.nan,
"nse_bot10": np.nan,
"nse_logged": np.nan,
"kge_logged": np.nan,
"kge_top10": np.nan,
"kge_bot10": np.nan,
"n_obs": len(obs)
}
return pd.Series(metrics)

@@ -224,27 +239,79 @@ def partition_metrics(
:param outfile: [str] file where the metrics should be written
:param val_sites: [list] sites to exclude from training and test metrics
:param test_sites: [list] sites to exclude from validation and training metrics
:param train_sites: [list] sites to exclude from test metrics
:param train_sites: [list] sites to exclude from validation and test metrics
:return: [pd dataframe] the condensed metrics
"""
var_data = fmt_preds_obs(preds, obs_file, spatial_idx_name,
time_idx_name)
var_metrics_list = []

for data_var, data in var_data.items():
#multiindex df
data_multiind = data.copy(deep=True)
data.reset_index(inplace=True)
# mask out validation and test sites from trn partition
if val_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(val_sites)]
if test_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(test_sites)]
# mask out test sites from val partition
if test_sites and partition=='val':
data = data[~data[spatial_idx_name].isin(test_sites)]
if train_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(train_sites)]
if val_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(val_sites)]
if train_sites and partition == 'trn':
Collaborator
I agree that there is an omission in the original code and that the following code should be added to correct that:

if train_sites and partition=='val':
    data = data[~data[spatial_idx_name].isin(train_sites)]

If I'm understanding them correctly, I think the suggested edits change the functionality of this section. As I read the original code, it appears that train_sites (as well as test_sites and val_sites) were sites that only appeared in that partition, but they weren't necessarily the only sites in that partition. In the revised code, it appears that if train_sites is specified, it will use only those sites in the training evaluations (and remove those sites from the test and validation partition).

If the intention is to change the functionality of train_sites, etc, then it's probably good to have a broader conversation. I am not using that option in my projects currently, but I don't know if others are.
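To make the distinction concrete, here is a minimal sketch with toy data (not the repository's code; the sites and values are hypothetical, only the `spatial_idx_name`/`train_sites` naming follows the diff):

```python
import pandas as pd

spatial_idx_name = "seg_id_nat"   # index name used throughout the diff
train_sites = ["A", "B"]          # hypothetical training-only sites

data = pd.DataFrame({
    spatial_idx_name: ["A", "B", "C", "D"],
    "obs":  [1.1, 1.9, 3.2, 3.8],
    "pred": [1.0, 2.0, 3.0, 4.0],
})

# Original behavior: train_sites are excluded from the *other* partitions,
# but the trn partition itself may still contain additional sites (C and D here).
other_partition = data[~data[spatial_idx_name].isin(train_sites)]

# Revised behavior: when train_sites is given, the trn partition is restricted
# to exactly those sites.
trn_partition = data[data[spatial_idx_name].isin(train_sites)]

print(other_partition[spatial_idx_name].tolist())  # ['C', 'D']
print(trn_partition[spatial_idx_name].tolist())    # ['A', 'B']
```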

Member Author
As I read the original code, it appears that train_sites (as well as test_sites and val_sites) were sites that only appeared in that partition, but they weren't necessarily the only sites in that partition.

Yes, I agree. I wasn't sure if this was intentional. I could add another explicit_spatial_split parameter here to allow for the previous functionality when False and this new functionality when True. I'll hold off on that before receiving feedback from others

Collaborator
Sounds good.

Collaborator
Oh wow. Yeah. This was definitely a bug! 😮 Thanks for catching this.

Member Author
@janetrbarclay: in a comment below, Jeff suggests we assume that sites within the train/val/test are the only sites in those partitions. That's also what I would expect. Do you know of anyone who is relying on the previous method wherein sites that are not within train_sites/val_sites/test_sites could be in the train/val/test partitions?

# simply use the train sites when specified.
data = data[data[spatial_idx_name].isin(train_sites)]
data_multiind = data_multiind.loc[data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
else:
#check if validation or testing sites are specified
if val_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(val_sites)]
data_multiind = data_multiind.loc[~data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]
if test_sites and partition == 'trn':
data = data[~data[spatial_idx_name].isin(test_sites)]
data_multiind = data_multiind.loc[~data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
# mask out training and test sites from val partition
if val_sites and partition == 'val':
data = data[data[spatial_idx_name].isin(val_sites)]
data_multiind = data_multiind.loc[data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]
else:
if test_sites and partition=='val':
data = data[~data[spatial_idx_name].isin(test_sites)]
data_multiind = data_multiind.loc[~data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
if train_sites and partition=='val':
data = data[~data[spatial_idx_name].isin(train_sites)]
data_multiind = data_multiind.loc[~data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
# mask out training and validation sites from tst partition
if test_sites and partition == 'tst':
data = data[data[spatial_idx_name].isin(test_sites)]
data_multiind = data_multiind.loc[data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(test_sites)]
else:
if train_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(train_sites)]
data_multiind = data_multiind.loc[~data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(train_sites)]
if val_sites and partition=='tst':
data = data[~data[spatial_idx_name].isin(val_sites)]
data_multiind = data_multiind.loc[~data_multiind
.index
.get_level_values(level=spatial_idx_name)
.isin(val_sites)]

if not group:
metrics = calc_metrics(data)
@@ -268,6 +335,99 @@ def partition_metrics(
.apply(calc_metrics)
.reset_index()
)
elif group == "year":
Collaborator
This is getting to be quite verbose. I suggest we split the group argument into two, maybe (group_spatially, and group_temporally). The group_spatially would be just a boolean. The group_temporally would be a str.

then the function could be something like:

if not group_spatially and not group_temporally:
    metrics = calc_metrics(data)
    # need to convert to dataframe and transpose so it looks like the
    # others
    metrics = pd.DataFrame(metrics).T
elif group_spatially and not group_temporally:
    metrics = data.groupby(spatial_idx_name).apply(calc_metrics).reset_index()
elif not group_spatially and group_temporally:
    metrics = data.groupby(pd.Grouper(key=time_idx_name, freq=group_temporally)).apply(calc_metrics).reset_index()
elif group_spatially and group_temporally:
    metrics = data.groupby([pd.Grouper(key=time_idx_name, freq=group_temporally),
                            pd.Grouper(key=spatial_idx_name)]
                            ).apply(calc_metrics).reset_index()

I think that should work. We'd just have to document how the group_temporally argument needs to work.
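If `group_temporally` is meant to be a pandas offset alias (the values used later in this thread are 'M', 'Y', and '2W'), a small illustration of how `pd.Grouper` consumes that string (toy data, not project code):

```python
import numpy as np
import pandas as pd

# Toy daily series spanning two months; "date" stands in for time_idx_name.
dates = pd.date_range("2020-01-01", "2020-02-29", freq="D")
df = pd.DataFrame({"date": dates, "obs": np.arange(len(dates), dtype=float)})

group_temporally = "M"  # offset alias: 'M' monthly, 'Y' yearly, '2W' biweekly

# Count of daily values falling in each monthly bin.
monthly_counts = df.groupby(pd.Grouper(key="date", freq=group_temporally))["obs"].count()
print(monthly_counts)  # 31 values for Jan 2020, 29 for Feb 2020
```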

Collaborator
Definitely a bigger change, but I think it would be worth trying. It would also require propagating the change all the way up including any Snakefiles that are using this function.

Collaborator
Would also resolve #212

Member Author
Thanks, that is much cleaner. I think one more argument would be needed for how to do temporal aggregation. I think what you've programmed would compute metrics for native timestep only (daily). I used a sum of the daily data to get biweekly, monthly, and yearly timesteps and compute metrics for those. Let me try out this edit because it will make the code much cleaner

Member Author
I addressed this change in the most recent commit. I needed 4 group args to replicate the previous functionality. I think it's more generic now to using different timesteps of aggregation, but it's more difficult to understand how to apply the 4 args to compute the desired metrics. For reference, here is the function that I used in the snakemake file to assign the 4 new args to this function to compute different metrics for different groups. See the description of the 4 args in the function docstring.

#Order in the list is: 
#group_spatially (bool), group_temporally (False or timestep to use), sum_aggregation (bool), site_based (bool)
def get_grp_arg(wildcards):
	if wildcards.metric_type == 'overall':
		return [False, False, False, False]
	elif wildcards.metric_type == 'month':
		return [False, 'M', False, False]
	elif wildcards.metric_type == 'reach':
		return [True, False, False, False]
	elif wildcards.metric_type == 'month_reach':
		return [True, 'M', False, False]
	elif wildcards.metric_type == 'monthly_site_based':
		return [False, 'M', True, True]
	elif wildcards.metric_type == 'monthly_all_sites':
		return [False, 'M', True, False]
	elif wildcards.metric_type == 'monthly_reach':
		return [True, 'M', True, False]

metrics = (
data.groupby(
data[time_idx_name].dt.year)
.apply(calc_metrics)
.reset_index()
)
elif group == ["seg_id_nat", "year"]:
metrics = (
data.groupby(
[data[time_idx_name].dt.year,
spatial_idx_name])
.apply(calc_metrics)
.reset_index()
)
elif group == "biweekly":
Collaborator
I think the addition of biweekly and yearly options for the metrics is great.

If I'm reading this correctly, "biweekly", "monthly", and "yearly" all also use "seg_id_nat". For consistency with the other grouped metrics, it seems good to have that included in the group list. (so group = ['seg_id_nat','biweekly'])

(and as an aside, I'm noticing we should remove the hardcoded reference to seg_id_nat and replace it with spatial_idx_name. I think it's just the 3 references in this section. Would you want to fix that in this PR since you're already editing this section?)

Also, without running (which I haven't done) I'm not sure how monthly and yearly are different from ['seg_id_nat','month'] and ['seg_id_nat','year'] since they are both grouping on the same things.

Member Author
jds485 Mar 14, 2023
I'm not sure how monthly and yearly are different from ['seg_id_nat','month'] and ['seg_id_nat','year'] since they are both grouping on the same things

The biweekly, monthly and yearly options are resampling the daily timeseries to those time steps by taking the sum of the data within those time periods (only for the days with observations). I'm not sure that sum is the best option and am open to other suggestions.

The resulting performance metrics are computed over all reaches, not by reach as with the ['seg_id_nat','time'] options, so I can add the group option that reports these metrics by reach

Member Author
I agree with removing the "seg_id_nat" comparison to be more generic, but that will affect snakemake workflows. For example, the workflow examples all have a function that defines group using seg_id_nat. Might be better to address this problem in a separate issue

Collaborator
The resulting performance metrics are computed over all reaches, not by reach as with the ['seg_id_nat','time'] options, so I can add the group option that reports these metrics by reach

I'm not super familiar with the pandas grouper (so maybe that's the source of my confusion), but both monthly and yearly use 2 pandas groupers, 1 on time_idx_name and one on spatial_idx_name, right? So are you summing by reach and then calculating metrics across all the reaches?

Member Author
So are you summing by reach and then calculating metrics across all the reaches?

yes, that's right

Member Author
jds485 Mar 14, 2023
I can add the group option that reports these metrics by reach

Edit: I added metrics for reach-biweekly, reach-monthly and reach-yearly timeseries.

We could also have reach-biweekly-month (summarize the biweekly timeseries by month), reach-biweekly-year, and reach-monthly-year. reach-biweekly-time would require an additional function to define a biweekly index for Python datetime objects.

#filter the data to remove nans before computing the sum
#so that the same days are being summed in each two-week period.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='2W'),
pd.Grouper(level=spatial_idx_name)])
.sum()
)
metrics = calc_metrics(data_calc)
metrics = pd.DataFrame(metrics).T
elif group == ["seg_id_nat", "biweekly"]:
#filter the data to remove nans before computing the sum
#so that the same days are being summed in each two-week period.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='2W'),
pd.Grouper(level=spatial_idx_name)])
.sum()
)
data_calc = data_calc.reset_index()
metrics = (data_calc
.groupby(spatial_idx_name)
.apply(calc_metrics)
.reset_index()
)
elif group == "monthly":
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the month.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='M'),
pd.Grouper(level=spatial_idx_name)])
.sum()
)
metrics = calc_metrics(data_calc)
metrics = pd.DataFrame(metrics).T
elif group == ["seg_id_nat", "monthly"]:
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the month.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='M'),
pd.Grouper(level=spatial_idx_name)])
.sum()
)
data_calc = data_calc.reset_index()
metrics = (data_calc
.groupby(spatial_idx_name)
.apply(calc_metrics)
.reset_index()
)
Collaborator
Not sure why you would need to break it into two steps. Did you try this (below) and it didn't work?

Suggested change
elif group == ["seg_id_nat", "monthly"]:
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the year.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='M'),
pd.Grouper(level=spatial_idx_name)])
.sum()
)
data_calc = data_calc.reset_index()
metrics = (data_calc
.groupby(spatial_idx_name)
.apply(calc_metrics)
.reset_index()
)
elif group == ["seg_id_nat", "monthly"]:
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the year.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='M'),
pd.Grouper(level=spatial_idx_name)])
.apply(calc_metrics)
.reset_index()
)

Member Author
No additional . functions were allowed after .sum(), so I broke it into several lines.

Also, noting that the original and suggested code compute metrics for different groups. The original code creates a monthly timeseries for each reach and computes reach-based metrics using the monthly timeseries. The suggested code uses data in the native timestep (e.g., daily) to compute metrics for each month-reach. Both of these are options in the latest commit

Collaborator
I guess my question was, why do you need to do .sum() in the first place?

Member Author
I used .sum() to convert the daily timeseries to a biweekly, monthly, and yearly timeseries for each reach. I think this is analogous to the overall metrics computed for the native timestep, which is also indexed by date-reach. Performance metrics for these timeseries indicate how well the model is able to predict at those timesteps.

Collaborator
Okay. A little slow, but I think I got it now. So it's a two-step process because first you are aggregating by the time-space dimensions (e.g., site-biweekly) and then you are aggregating by the space (e.g., site). So you end up with just one number per site, but it's looking at bigger windows of time at a time (e.g., two weeks instead of every day).

Am I getting that right?
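That two-step flow in miniature (toy data; `calc_rmse` stands in for the repository's `calc_metrics`, and the first step uses the mean, which a later commit switches to):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
dates = pd.date_range("2021-01-01", periods=60, freq="D")
idx = pd.MultiIndex.from_product([dates, ["site1", "site2"]], names=["date", "site_id"])
df = pd.DataFrame({"obs": rng.normal(10, 2, len(idx))}, index=idx)
df["pred"] = df["obs"] + rng.normal(0, 1, len(idx))

def calc_rmse(g):
    # stand-in for the repository's calc_metrics: one summary value per group
    return pd.Series({"rmse": np.sqrt(np.mean((g["pred"] - g["obs"]) ** 2))})

# Step 1: aggregate the daily values to a site-biweekly timeseries (mean per window).
biweekly = (
    df.dropna()
    .groupby([pd.Grouper(level="date", freq="2W"), pd.Grouper(level="site_id")])
    .mean()
)

# Step 2: group that coarser timeseries by site and compute the metric,
# yielding one value per site based on two-week windows instead of days.
per_site = (
    biweekly.reset_index()
    .groupby("site_id")[["obs", "pred"]]
    .apply(calc_rmse)
)
print(per_site)
```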

Collaborator
If that's right, why did you choose sum instead of mean? It seems like mean would be better.

For example, if I do a rmse on the biweekly sums, I get

site_id
01466500      8.862246
01472104     23.094643
01472119     23.529100
014721254    20.738631
014721259    16.497109
01473499     17.143101
01473500     17.544453
01473675     17.654814
01473780     15.881052
01474500     12.225832
01480617     35.625263
01480870     36.393360
01481000     24.799420
01481500     26.861900

which is way high.

If I do it on the means:

site_id
01466500     0.633631
01472104     1.651097
01472119     1.682208
014721254    1.481826
014721259    1.179091
01473499     1.225882
01473500     1.254535
01473675     1.262302
01473780     1.136314
01474500     0.874386
01480617     2.546138
01480870     2.601188
01481000     1.772298
01481500     1.919950

A lot lower.

Is that what you'd expect?
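One way to see why the summed version runs so high: if every two-week window contained the same number $n$ of observed days, each windowed sum would equal $n$ times the windowed mean, so the residuals, and hence the RMSE over the $m$ windows, scale by $n$:

$$\mathrm{RMSE}_{\text{sums}}=\sqrt{\tfrac{1}{m}\sum_{i=1}^{m}\bigl(n\,\bar{\hat y}_i-n\,\bar y_i\bigr)^2}=n\cdot\mathrm{RMSE}_{\text{means}} .$$

With near-complete daily coverage $n\approx 14$, which is roughly the ratio between the two listings above (e.g. 8.86 / 0.63 ≈ 14).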

Member Author
jds485 Mar 29, 2023
So it's a two-step process because first you are aggregating by the time-space dimensions (e.g., site-biweekly) and then you are aggregating by the space (e.g., site). So you end up with just one number per site, but it's looking at bigger windows of time at a time

Yes, that's right.

why did you choose sum instead of mean?

Good point - the daily values are daily averages, not the total flow in a day. For some reason I was thinking of total flow, salinity, etc. I think the sum metrics will also be inflated compared to the mean. Here's an example for biweekly metrics for specific conductivity. Log RMSE is the same because log(sum_pred/n) - log(sum_obs/n) = log(sum_pred) - log(sum_obs)

metrics_sum
Out[126]: 
rmse               10407.172144
nse                    0.946307
rmse_top10         28507.780134
rmse_bot10          1128.871582
rmse_logged            0.162555
mean_bias          -3456.110397
mean_bias_top10   -11188.212460
mean_bias_bot10     -618.612363
nse_top10              0.793622
nse_bot10              0.803559
nse_logged             0.963944
kge                    0.899966
kge_logged             0.977389
kge_top10              0.881643
kge_bot10              0.875451
n_obs                966.000000
dtype: float64
metrics_mean
Out[127]: 
rmse                47.075655
nse                  0.649618
rmse_top10         121.434744
rmse_bot10          21.972157
rmse_logged          0.162555
mean_bias          -21.274822
mean_bias_top10    -88.179638
mean_bias_bot10    -15.051474
nse_top10           -1.496829
nse_bot10            0.258352
nse_logged           0.698174
kge                  0.756805
kge_logged           0.887974
kge_top10            0.220643
kge_bot10            0.732286
n_obs              966.000000
dtype: float64
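Spelling out the log-RMSE equality noted above: for a window aggregating $n$ observed days,

$$\log\frac{\sum\hat y}{n}-\log\frac{\sum y}{n}=\Bigl(\log\textstyle\sum\hat y-\log n\Bigr)-\Bigl(\log\textstyle\sum y-\log n\Bigr)=\log\textstyle\sum\hat y-\log\textstyle\sum y ,$$

so every logged residual, and therefore rmse_logged, is the same whether the windows are aggregated with sums or with means.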

elif group == "yearly":
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the year.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='Y'),
pd.Grouper(level=spatial_idx_name)])
.sum()
)
metrics = calc_metrics(data_calc)
metrics = pd.DataFrame(metrics).T
elif group == ["seg_id_nat", "yearly"]:
#filter the data to remove nans before computing the sum
#so that the same days are being summed in the year.
data_calc = (data_multiind.dropna()
.groupby(
[pd.Grouper(level=time_idx_name, freq='Y'),
pd.Grouper(level=spatial_idx_name)])
.sum()
)
data_calc = data_calc.reset_index()
metrics = (data_calc
.groupby(spatial_idx_name)
.apply(calc_metrics)
.reset_index()
)
else:
raise ValueError("group value not valid")

@@ -321,7 +481,17 @@ def combined_metrics(
:param group: [str or list] which group the metrics should be computed for.
Currently only supports 'seg_id_nat' (segment-wise metrics), 'month'
(month-wise metrics), ['seg_id_nat', 'month'] (metrics broken out by segment
and month), and None (everything is left together)
and month), 'year' (year-wise metrics), ['seg_id_nat', 'year']
(metrics broken out by segment and year), 'biweekly' (metrics for
biweekly timeseries, aggregated by summing data in the original timestep)
'monthly' (metrics for monthly timeseries, aggregated by summing data
in the original timestep), 'yearly' (metrics for yearly timeseries,
aggregated by summing data in the original timestep),
["seg_id_nat", "biweekly"] (metrics for biweekly timeseries broken out
by segment), ["seg_id_nat", "monthly"] (metrics for monthly timeseries broken out
by segment), ["seg_id_nat", "yearly"] (metrics for yearly timeseries
broken out by segment), and None (metrics in the original timestep computed
across all space)
:param id_dict: [dict] dictionary of id_dict where dict keys are the id
names and dict values are the id values. These are added as columns to the
metrics information
@@ -356,7 +526,7 @@ def combined_metrics(
group=group,
val_sites = val_sites,
test_sites = test_sites,
train_sites=train_sites)
train_sites = train_sites)
df_all.extend([metrics])

df_all = pd.concat(df_all, axis=0)