This repository has been archived by the owner on Jun 2, 2023. It is now read-only.
Edits to spatial train/val/test, additional performance metrics #211
I agree that there is an omission in the original code and that the following code should be added to correct that:
If I'm understanding them correctly, I think the suggested edits change the functionality of this section. As I read the original code, it appears that train_sites (as well as test_sites and val_sites) were sites that only appeared in that partition, but they weren't necessarily the only sites in that partition. In the revised code, it appears that if train_sites is specified, it will use only those sites in the training evaluations (and remove those sites from the test and validation partition).
If the intention is to change the functionality of train_sites, etc., then it's probably good to have a broader conversation. I'm not using that option in my projects currently, but I don't know whether others are.
Yes, I agree. I wasn't sure if this was intentional. I could add another `explicit_spatial_split` parameter here to allow for the previous functionality when False and this new functionality when True. I'll hold off on that until receiving feedback from others.
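The flag described above could look something like this. A minimal sketch, assuming a flat list of site IDs; the function name and signature are hypothetical, not the repo's API:

```python
def assign_train_sites(all_sites, train_sites, val_sites, test_sites,
                       explicit_spatial_split=False):
    """Hypothetical sketch of the proposed flag (not the repo's API).

    explicit_spatial_split=True: the train partition is exactly the
    listed train_sites (the new behavior).
    explicit_spatial_split=False: train_sites are guaranteed to be in
    the train partition, but any site not listed for val/test lands
    there too (the previous behavior).
    """
    if explicit_spatial_split:
        wanted = set(train_sites)
        return [s for s in all_sites if s in wanted]
    excluded = set(val_sites) | set(test_sites)
    return [s for s in all_sites if s not in excluded]
```

With sites A-D and train_sites=["A"], val_sites=["C"], test_sites=["D"], the flag set to True yields only ["A"], while False also sweeps in the unlisted site "B".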
Sounds good.
Oh wow. Yeah. This was definitely a bug! 😮 Thanks for catching this.
@janetrbarclay: in a comment below, Jeff suggests we assume that sites within the train/val/test are the only sites in those partitions. That's also what I would expect. Do you know of anyone who is relying on the previous method wherein sites that are not within train_sites/val_sites/test_sites could be in the train/val/test partitions?
This is getting to be quite verbose. I suggest we split the `group` argument into two, maybe `group_spatially` and `group_temporally`. The `group_spatially` argument would be just a boolean; `group_temporally` would be a `str`. Then the function could be something like:
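One possible shape for that function, sketched under the assumption that the data is a pandas DataFrame with a (spatial, datetime) MultiIndex; all names here are illustrative, not the repo's actual API:

```python
import pandas as pd

def group_obs(df, spatial_idx_name="seg_id_nat", time_idx_name="date",
              group_spatially=False, group_temporally=None):
    """Sketch of the proposed two-argument grouping (names illustrative).

    Assumes df has a (spatial, time) MultiIndex with a datetime time
    level. group_temporally is a pandas offset alias such as "MS"
    (monthly) or "YS" (yearly); None means no temporal grouping.
    """
    groupers = []
    if group_spatially:
        # group by reach
        groupers.append(pd.Grouper(level=spatial_idx_name))
    if group_temporally:
        # group by time bin at the requested frequency
        groupers.append(pd.Grouper(level=time_idx_name, freq=group_temporally))
    if not groupers:
        raise ValueError("no grouping requested")
    return df.groupby(groupers)
```

For example, `group_obs(df, group_spatially=True, group_temporally="MS").mean()` would give per-reach, per-month groups.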
I think that should work. We'd just have to document how the `group_temporally` argument needs to work.
Definitely a bigger change, but I think it would be worth trying. It would also require propagating the change all the way up including any Snakefiles that are using this function.
Would also resolve #212
Thanks, that is much cleaner. I think one more argument would be needed to specify how to do the temporal aggregation. I think what you've programmed would compute metrics for the native timestep only (daily). I used a sum of the daily data to get biweekly, monthly, and yearly timesteps and computed metrics for those. Let me try out this edit, because it will make the code much cleaner.
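The summing described above could be sketched like this (a hypothetical helper, again assuming a (spatial, datetime) MultiIndex; `min_count=1` is one way to keep "only days with observations" semantics):

```python
import pandas as pd

def aggregate_obs(df, timestep, spatial_idx_name="seg_id_nat",
                  time_idx_name="date"):
    """Sum daily values up to a coarser timestep, per reach (sketch).

    timestep is a pandas offset alias, e.g. "2W" (biweekly),
    "MS" (monthly), "YS" (yearly). NaN days are skipped, and
    min_count=1 keeps periods with no observations as NaN rather
    than summing them to 0.
    """
    return df.groupby(
        [pd.Grouper(level=spatial_idx_name),
         pd.Grouper(level=time_idx_name, freq=timestep)]
    ).sum(min_count=1)
```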
I addressed this change in the most recent commit. I needed 4 group args to replicate the previous functionality. I think it's now more generic with respect to different aggregation timesteps, but it's more difficult to understand how to apply the 4 args to compute the desired metrics. For reference, here is the function I used in the snakemake file to assign the 4 new args to this function to compute different metrics for different groups. See the description of the 4 args in the function docstring.
I think the addition of biweekly and yearly options for the metrics is great.
If I'm reading this correctly, "biweekly", "monthly", and "yearly" all also use "seg_id_nat". For consistency with the other grouped metrics, it seems good to have that included in the group list (so group = ['seg_id_nat', 'biweekly']).
(and as an aside, I'm noticing we should remove the hardcoded reference to seg_id_nat and replace it with spatial_idx_name. I think it's just the 3 references in this section. Would you want to fix that in this PR since you're already editing this section?)
Also, without running (which I haven't done) I'm not sure how monthly and yearly are different from ['seg_id_nat','month'] and ['seg_id_nat','year'] since they are both grouping on the same things.
The biweekly, monthly and yearly options are resampling the daily timeseries to those time steps by taking the sum of the data within those time periods (only for the days with observations). I'm not sure that sum is the best option and am open to other suggestions.
The resulting performance metrics are computed over all reaches, not by reach as with the ['seg_id_nat','time'] options, so I can add a group option that reports these metrics by reach.
I agree with removing the "seg_id_nat" comparison to be more generic, but that will affect snakemake workflows. For example, the workflow examples all have a function that defines `group` using seg_id_nat. It might be better to address this problem in a separate issue.
I'm not super familiar with the pandas Grouper (so maybe that's the source of my confusion), but both monthly and yearly use two pandas Groupers, one on time_idx_name and one on spatial_idx_name, right? So are you summing by reach and then calculating metrics across all the reaches?
Yes, that's right.
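The confirmed flow could be sketched like this, with hypothetical column names (`obs`, `pred`): sum by reach and month using the two Groupers, then compute a single metric over all resulting reach-month pairs rather than per reach:

```python
import numpy as np
import pandas as pd

def monthly_rmse_all_reaches(df, spatial_idx_name="seg_id_nat",
                             time_idx_name="date"):
    """Sum obs and pred by reach and month (two pandas Groupers),
    then compute one RMSE over all reach-month pairs rather than
    one RMSE per reach. Column names are hypothetical."""
    monthly = df.groupby(
        [pd.Grouper(level=spatial_idx_name),
         pd.Grouper(level=time_idx_name, freq="MS")]
    ).sum()
    err = monthly["pred"] - monthly["obs"]
    return float(np.sqrt((err ** 2).mean()))
```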
Edit: I added metrics for reach-biweekly, reach-monthly and reach-yearly timeseries.
We could also have reach-biweekly-month (summarize the biweekly timeseries by month), reach-biweekly-year, and reach-monthly-year. reach-biweekly-time would require an additional function to define a biweekly index for Python datetime objects.
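Such a helper might look like the following, a hypothetical sketch that labels each date with a 14-day bin within its year, analogous to the existing month/year groupings; the last bin absorbs the one or two leftover days at year end:

```python
import numpy as np
import pandas as pd

def biweekly_index(dates):
    """Hypothetical helper: assign each date a biweekly bin (1-26)
    within its year. Day-of-year is binned into 14-day windows, and
    day 365/366 is clipped into bin 26 so every date gets a label."""
    dates = pd.DatetimeIndex(dates)
    bins = (np.asarray(dates.dayofyear) - 1) // 14 + 1
    return np.minimum(bins, 26)
```

This would let a reach-biweekly-time grouping work the way the month and year groupings do, by grouping on the returned bin alongside the spatial index.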