Fix --secs #159

Closed
wants to merge 1 commit
other.py: 83 changes (59 additions, 24 deletions)
@@ -2084,20 +2084,29 @@ def iter(model, batch):
result.update(outputs)
return result


def count_lapse(r_history, t_history):
lapse = 0
for r, t in zip(r_history.split(","), t_history.split(",")):
if t != "0" and r == "1":
lapse += 1
return lapse
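# Reviewer note (assuming the benchmark's Anki-style encoding, where rating
# "1" means Again): count_lapse counts failed reviews, skipping entries whose
# recorded interval is "0" (same-day reviews).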


def get_bin(row):
raw_lapse = count_lapse(row["r_history"], row["t_history"])
lapse = round(1.65 * np.power(1.73, np.floor(np.log(raw_lapse) / np.log(1.73))), 0) if raw_lapse != 0 else 0
delta_t = round(2.48 * np.power(3.62, np.floor(np.log(row["delta_t"]) / np.log(3.62))), 2)
lapse = (
round(1.65 * np.power(1.73, np.floor(np.log(raw_lapse) / np.log(1.73))), 0)
if raw_lapse != 0
else 0
)
delta_t = round(
2.48 * np.power(3.62, np.floor(np.log(row["delta_t"]) / np.log(3.62))), 2
)
i = round(1.99 * np.power(1.89, np.floor(np.log(row["i"]) / np.log(1.89))), 0)
return (lapse, delta_t, i)
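# Reviewer sketch of the binning scheme (constants taken from the code above):
# each component is snapped onto an exponential grid,
# x -> round(a * b**floor(log(x)/log(b))), so bin widths grow geometrically.
# Worked examples:
#   raw_lapse = 5:  floor(log(5)/log(1.73)) = 2,  round(1.65 * 1.73**2, 0) = 5.0
#   delta_t  = 10:  floor(log(10)/log(3.62)) = 1, round(2.48 * 3.62**1, 2) = 8.98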


class RMSEBinsExploit:
def __init__(self):
super().__init__()
@@ -2120,22 +2129,25 @@ def predict(self, bin_key):

if bin_key not in self.state:
self.state[bin_key] = (0, 0, 0)

pred_sum, truth_sum, bin_n = self.state[bin_key]
estimated_p = self.global_succ / self.global_n
pred = np.clip(truth_sum + estimated_p - pred_sum, a_min=0, a_max=1)
self.state[bin_key] = (pred_sum + pred, truth_sum, bin_n)
return pred
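# Reviewer interpretation (not part of the diff): each call picks pred so that
# the running pred_sum converges to truth_sum + estimated_p. Assuming
# truth_sum is updated elsewhere as real outcomes arrive, the per-bin mean
# error (truth_sum - pred_sum) / bin_n is driven toward zero, which is exactly
# the quantity RMSE(bins) aggregates -- hence the "exploit".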


class ConstantModel(nn.Module):
n_epoch = 0
lr = 0
wd = 0

def __init__(self, value=0.9):
super().__init__()
self.value=value
self.placeholder = torch.nn.Linear(1, 1) # So that the optimizer gets a nonempty list
self.value = value
self.placeholder = torch.nn.Linear(
1, 1
) # So that the optimizer gets a nonempty list

def iter(
self,
Expand All @@ -2146,6 +2158,7 @@ def iter(
) -> dict[str, Tensor]:
return {"retentions": torch.full((real_batch_size,), self.value)}
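# Reviewer sketch: the model ignores its inputs and always predicts `value`,
# so ConstantModel(0.9).iter(...)["retentions"] is a tensor filled with 0.9;
# with n_epoch = 0 it is never actually trained.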


class Trainer:
optimizer: torch.optim.Optimizer

@@ -2417,9 +2430,9 @@ def baseline(user_id):
stats, raw = evaluate(y, p, save_tmp, model_name, user_id)
return stats, raw


def rmse_bins_exploit(user_id):
""" This model attempts to exploit rmse(bins) by keeping track of per-bin statistics
"""
"""This model attempts to exploit rmse(bins) by keeping track of per-bin statistics"""
model_name = "RMSE-BINS-EXPLOIT"
dataset = pd.read_parquet(
DATA_PATH / "revlogs", filters=[("user_id", "=", user_id)]
@@ -2453,6 +2466,7 @@ def rmse_bins_exploit(user_id):
stats, raw = evaluate(y, p, save_tmp, model_name, user_id)
return stats, raw


def create_features_helper(df, model_name, secs_ivl=SECS_IVL):
df["review_th"] = range(1, df.shape[0] + 1)
df.sort_values(by=["card_id", "review_th"], inplace=True)
@@ -2469,7 +2483,7 @@ def create_features_helper(df, model_name, secs_ivl=SECS_IVL):
df["delta_t_secs"] = df["elapsed_seconds"] / 86400
df["delta_t_secs"] = df["delta_t_secs"].map(lambda x: max(0, x))

if not SHORT_TERM:
if not secs_ivl and not SHORT_TERM:
df.drop(df[df["elapsed_days"] == 0].index, inplace=True)
df["i"] = df.groupby("card_id").cumcount() + 1
df["delta_t"] = df["delta_t"].map(lambda x: max(0, x))
@@ -2487,21 +2501,29 @@ def create_features_helper(df, model_name, secs_ivl=SECS_IVL):
",".join(map(str, item[:-1])) for sublist in r_history for item in sublist
]
df["t_history"] = [
",".join(map(str, item[:-1])) for sublist in t_history_non_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_non_secs
for item in sublist
]
if secs_ivl:
if EQUALIZE_TEST_WITH_NON_SECS:
df["t_history"] = [
",".join(map(str, item[:-1])) for sublist in t_history_non_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_non_secs
for item in sublist
]
df["t_history_secs"] = [
",".join(map(str, item[:-1])) for sublist in t_history_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_secs
for item in sublist
]
else:
# If we do not care about test equality, we are allowed to overwrite delta_t and t_history
df["delta_t"] = df["delta_t_secs"]
df["t_history"] = [
",".join(map(str, item[:-1])) for sublist in t_history_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_secs
for item in sublist
]

t_history_used = t_history_secs
@@ -2648,32 +2670,41 @@ def r_history_to_l_history(r_history):
df = df.groupby("card_id", as_index=False, group_keys=False)[df.columns].apply(
remove_non_continuous_rows
)
return df[df["delta_t"] > 0].sort_values(by=["review_th"])
if secs_ivl:
df.drop(df[df["delta_t_secs"] <= 0].index, inplace=True)
df["delta_t"] = df["delta_t_secs"]
return df[df["elapsed_days"] >= 0].sort_values(by=["review_th"])
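# Reviewer note on the fix (interpretation): in secs mode, rows are now
# filtered on delta_t_secs <= 0 rather than delta_t > 0, and the final filter
# keeps elapsed_days >= 0, so same-day reviews with a positive seconds
# interval are no longer discarded.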


def create_features(df, model_name="FSRSv3"):
if SECS_IVL and EQUALIZE_TEST_WITH_NON_SECS:
df_non_secs = create_features_helper(df.copy(), model_name, False)
df = create_features_helper(df, model_name, True)
df_intersect = df[df["review_th"].isin(df_non_secs["review_th"])]
df_secs = create_features_helper(df, model_name, True)
df_intersect = df_secs[df_secs["review_th"].isin(df_non_secs["review_th"])]
# rmse_bins requires that delta_t, i, r_history, t_history remain the same as with non-secs
assert len(df_intersect) == len(df_non_secs)
assert np.equal(df_intersect["delta_t"], df_non_secs["delta_t"]).all()
assert np.equal(df_intersect["i"], df_non_secs["i"]).all()
assert np.equal(df_intersect["t_history"], df_non_secs["t_history"]).all()
assert np.equal(df_intersect["r_history"], df_non_secs["r_history"]).all()
# assert np.equal(df_intersect["delta_t"], df_non_secs["delta_t"]).all()
# assert np.equal(df_intersect["i"], df_non_secs["i"]).all()
# assert np.equal(df_intersect["t_history"], df_non_secs["t_history"]).all()
# assert np.equal(df_intersect["r_history"], df_non_secs["r_history"]).all()

tscv = TimeSeriesSplit(n_splits=n_splits)
for split_i, (_, non_secs_test_index) in enumerate(tscv.split(df_non_secs)):
non_secs_test_set = df_non_secs.iloc[non_secs_test_index]
# For the resulting train set, only allow reviews whose review_th is less than the smallest review_th in non_secs_test_set
allowed_train = df[df["review_th"] < non_secs_test_set["review_th"].min()]
df[f"{split_i}_train"] = df["review_th"].isin(allowed_train["review_th"])
allowed_train = df_secs[
df_secs["review_th"] < non_secs_test_set["review_th"].min()
]
df_secs[f"{split_i}_train"] = df_secs["review_th"].isin(
allowed_train["review_th"]
)

# For the resulting test set, only allow reviews that exist in non_secs_test_set
df[f"{split_i}_test"] = df["review_th"].isin(non_secs_test_set["review_th"])
df_secs[f"{split_i}_test"] = df_secs["review_th"].isin(
non_secs_test_set["review_th"]
)

return df
return df_secs
else:
return create_features_helper(df, model_name, SECS_IVL)
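# Reviewer summary of the equalization path: features are built twice, with
# day-resolution intervals (df_non_secs) and second-resolution intervals
# (df_secs). The TimeSeriesSplit boundaries are derived from df_non_secs and
# mapped onto df_secs via review_th, so both variants are trained and tested
# on the same set of reviews.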

@@ -2726,8 +2757,10 @@ def process(user_id):
elif MODEL_NAME == "Anki":
Model = Anki
elif MODEL_NAME == "90%":

def get_constant_model(state_dict=None):
return ConstantModel(0.9)

Model = get_constant_model

dataset = create_features(df_revlogs, MODEL_NAME)
@@ -2810,7 +2843,9 @@ def get_constant_model(state_dict=None):
partition_testset = testset[testset["partition"] == partition].copy()
weights = w.get(partition, None)
my_collection = Collection(Model(weights) if weights else Model())
retentions, stabilities, difficulties = my_collection.batch_predict(partition_testset)
retentions, stabilities, difficulties = my_collection.batch_predict(
partition_testset
)
partition_testset["p"] = retentions
if stabilities:
partition_testset["s"] = stabilities