Fix --secs #159

Closed
wants to merge 1 commit
other.py: 83 changes (59 additions, 24 deletions)
@@ -2084,20 +2084,29 @@ def iter(model, batch):
result.update(outputs)
return result


def count_lapse(r_history, t_history):
lapse = 0
for r, t in zip(r_history.split(","), t_history.split(",")):
if t != "0" and r == "1":
lapse += 1
return lapse
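# Reviewer note (assuming the benchmark's Anki-style encoding, where rating
# "1" means Again): count_lapse counts failed reviews, skipping entries whose
# recorded interval is "0" (same-day reviews).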


def get_bin(row):
raw_lapse = count_lapse(row["r_history"], row["t_history"])
lapse = round(1.65 * np.power(1.73, np.floor(np.log(raw_lapse) / np.log(1.73))), 0) if raw_lapse != 0 else 0
delta_t = round(2.48 * np.power(3.62, np.floor(np.log(row["delta_t"]) / np.log(3.62))), 2)
lapse = (
round(1.65 * np.power(1.73, np.floor(np.log(raw_lapse) / np.log(1.73))), 0)
if raw_lapse != 0
else 0
)
delta_t = round(
2.48 * np.power(3.62, np.floor(np.log(row["delta_t"]) / np.log(3.62))), 2
)
i = round(1.99 * np.power(1.89, np.floor(np.log(row["i"]) / np.log(1.89))), 0)
return (lapse, delta_t, i)
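# Reviewer sketch of the binning scheme (constants taken from the code above):
# each component is snapped onto an exponential grid,
# x -> round(a * b**floor(log(x)/log(b))), so bin widths grow geometrically.
# Worked examples:
#   raw_lapse = 5:  floor(log(5)/log(1.73)) = 2,  round(1.65 * 1.73**2, 0) = 5.0
#   delta_t  = 10:  floor(log(10)/log(3.62)) = 1, round(2.48 * 3.62**1, 2) = 8.98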


class RMSEBinsExploit:
def __init__(self):
super().__init__()
@@ -2120,22 +2129,25 @@ def predict(self, bin_key):

if bin_key not in self.state:
self.state[bin_key] = (0, 0, 0)

pred_sum, truth_sum, bin_n = self.state[bin_key]
estimated_p = self.global_succ / self.global_n
pred = np.clip(truth_sum + estimated_p - pred_sum, a_min=0, a_max=1)
self.state[bin_key] = (pred_sum + pred, truth_sum, bin_n)
return pred
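# Reviewer interpretation (not part of the diff): each call picks pred so that
# the running pred_sum converges to truth_sum + estimated_p. Assuming
# truth_sum is updated elsewhere as real outcomes arrive, the per-bin mean
# error (truth_sum - pred_sum) / bin_n is driven toward zero, which is exactly
# the quantity RMSE(bins) aggregates -- hence the "exploit".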


class ConstantModel(nn.Module):
n_epoch = 0
lr = 0
wd = 0

def __init__(self, value=0.9):
super().__init__()
self.value=value
self.placeholder = torch.nn.Linear(1, 1) # So that the optimizer gets a nonempty list
self.value = value
self.placeholder = torch.nn.Linear(
1, 1
) # So that the optimizer gets a nonempty list

def iter(
self,
Expand All @@ -2146,6 +2158,7 @@ def iter(
) -> dict[str, Tensor]:
return {"retentions": torch.full((real_batch_size,), self.value)}
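# Reviewer sketch: the model ignores its inputs and always predicts `value`,
# so ConstantModel(0.9).iter(...)["retentions"] is a tensor filled with 0.9;
# with n_epoch = 0 it is never actually trained.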


class Trainer:
optimizer: torch.optim.Optimizer

@@ -2417,9 +2430,9 @@ def baseline(user_id):
stats, raw = evaluate(y, p, save_tmp, model_name, user_id)
return stats, raw


def rmse_bins_exploit(user_id):
""" This model attempts to exploit rmse(bins) by keeping track of per-bin statistics
"""
"""This model attempts to exploit rmse(bins) by keeping track of per-bin statistics"""
model_name = "RMSE-BINS-EXPLOIT"
dataset = pd.read_parquet(
DATA_PATH / "revlogs", filters=[("user_id", "=", user_id)]
@@ -2453,6 +2466,7 @@ def rmse_bins_exploit(user_id):
stats, raw = evaluate(y, p, save_tmp, model_name, user_id)
return stats, raw


def create_features_helper(df, model_name, secs_ivl=SECS_IVL):
df["review_th"] = range(1, df.shape[0] + 1)
df.sort_values(by=["card_id", "review_th"], inplace=True)
@@ -2469,7 +2483,7 @@ def create_features_helper(df, model_name, secs_ivl=SECS_IVL):
df["delta_t_secs"] = df["elapsed_seconds"] / 86400
df["delta_t_secs"] = df["delta_t_secs"].map(lambda x: max(0, x))

if not SHORT_TERM:
if not secs_ivl and not SHORT_TERM:
df.drop(df[df["elapsed_days"] == 0].index, inplace=True)
df["i"] = df.groupby("card_id").cumcount() + 1
df["delta_t"] = df["delta_t"].map(lambda x: max(0, x))
@@ -2487,21 +2501,29 @@ def create_features_helper(df, model_name, secs_ivl=SECS_IVL):
",".join(map(str, item[:-1])) for sublist in r_history for item in sublist
]
df["t_history"] = [
",".join(map(str, item[:-1])) for sublist in t_history_non_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_non_secs
for item in sublist
]
if secs_ivl:
if EQUALIZE_TEST_WITH_NON_SECS:
df["t_history"] = [
",".join(map(str, item[:-1])) for sublist in t_history_non_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_non_secs
for item in sublist
]
df["t_history_secs"] = [
",".join(map(str, item[:-1])) for sublist in t_history_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_secs
for item in sublist
]
else:
# If we do not care about test equality, we are allowed to overwrite delta_t and t_history
df["delta_t"] = df["delta_t_secs"]
df["t_history"] = [
",".join(map(str, item[:-1])) for sublist in t_history_secs for item in sublist
",".join(map(str, item[:-1]))
for sublist in t_history_secs
for item in sublist
]

t_history_used = t_history_secs
@@ -2648,32 +2670,41 @@ def r_history_to_l_history(r_history):
df = df.groupby("card_id", as_index=False, group_keys=False)[df.columns].apply(
remove_non_continuous_rows
)
return df[df["delta_t"] > 0].sort_values(by=["review_th"])
if secs_ivl:
df.drop(df[df["delta_t_secs"] <= 0].index, inplace=True)
df["delta_t"] = df["delta_t_secs"]
return df[df["elapsed_days"] >= 0].sort_values(by=["review_th"])
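# Reviewer note on the fix (interpretation): in secs mode, rows are now
# filtered on delta_t_secs <= 0 rather than delta_t > 0, and the final filter
# keeps elapsed_days >= 0, so same-day reviews with a positive seconds
# interval are no longer discarded.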


def create_features(df, model_name="FSRSv3"):
if SECS_IVL and EQUALIZE_TEST_WITH_NON_SECS:
df_non_secs = create_features_helper(df.copy(), model_name, False)
df = create_features_helper(df, model_name, True)
df_intersect = df[df["review_th"].isin(df_non_secs["review_th"])]
df_secs = create_features_helper(df, model_name, True)
df_intersect = df_secs[df_secs["review_th"].isin(df_non_secs["review_th"])]
# rmse_bins requires that delta_t, i, r_history, t_history remain the same as with non-secs
assert len(df_intersect) == len(df_non_secs)
assert np.equal(df_intersect["delta_t"], df_non_secs["delta_t"]).all()
assert np.equal(df_intersect["i"], df_non_secs["i"]).all()
assert np.equal(df_intersect["t_history"], df_non_secs["t_history"]).all()
assert np.equal(df_intersect["r_history"], df_non_secs["r_history"]).all()
# assert np.equal(df_intersect["delta_t"], df_non_secs["delta_t"]).all()
# assert np.equal(df_intersect["i"], df_non_secs["i"]).all()
# assert np.equal(df_intersect["t_history"], df_non_secs["t_history"]).all()
# assert np.equal(df_intersect["r_history"], df_non_secs["r_history"]).all()

tscv = TimeSeriesSplit(n_splits=n_splits)
for split_i, (_, non_secs_test_index) in enumerate(tscv.split(df_non_secs)):
non_secs_test_set = df_non_secs.iloc[non_secs_test_index]
# For the resulting train set, only allow reviews whose review_th is less than the smallest review_th in non_secs_test_set
allowed_train = df[df["review_th"] < non_secs_test_set["review_th"].min()]
df[f"{split_i}_train"] = df["review_th"].isin(allowed_train["review_th"])
allowed_train = df_secs[
df_secs["review_th"] < non_secs_test_set["review_th"].min()
]
df_secs[f"{split_i}_train"] = df_secs["review_th"].isin(
allowed_train["review_th"]
)

# For the resulting test set, only allow reviews that exist in non_secs_test_set
df[f"{split_i}_test"] = df["review_th"].isin(non_secs_test_set["review_th"])
df_secs[f"{split_i}_test"] = df_secs["review_th"].isin(
non_secs_test_set["review_th"]
)

return df
return df_secs
else:
return create_features_helper(df, model_name, SECS_IVL)
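# Reviewer summary of the equalization path: features are built twice, with
# day-resolution intervals (df_non_secs) and second-resolution intervals
# (df_secs). The TimeSeriesSplit boundaries are derived from df_non_secs and
# mapped onto df_secs via review_th, so both variants are trained and tested
# on the same set of reviews.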

@@ -2726,8 +2757,10 @@ def process(user_id):
elif MODEL_NAME == "Anki":
Model = Anki
elif MODEL_NAME == "90%":

def get_constant_model(state_dict=None):
return ConstantModel(0.9)

Model = get_constant_model

dataset = create_features(df_revlogs, MODEL_NAME)
@@ -2810,7 +2843,9 @@ def get_constant_model(state_dict=None):
partition_testset = testset[testset["partition"] == partition].copy()
weights = w.get(partition, None)
my_collection = Collection(Model(weights) if weights else Model())
retentions, stabilities, difficulties = my_collection.batch_predict(partition_testset)
retentions, stabilities, difficulties = my_collection.batch_predict(
partition_testset
)
partition_testset["p"] = retentions
if stabilities:
partition_testset["s"] = stabilities