When running saits multiple times, the same parameters cannot yield the same results. #555

liujian123223 · 2024-12-04T00:58:57Z

Issue description

If I only run the following code once：“saits_output = evaluate_model(n_layers=1, d_model=64, d_ffn=64,n_heads=1, d_k=64, d_v=64, lr=1e-4,dataset=dataset_for_testing)
”, I can get the same result by running it multiple times. But when I run a program multiple times, the output results are slightly different each time. How can I solve this problem?

import random
import numpy as np
import benchpots
from pypots.utils.random import set_random_seed
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from pypots.utils.metrics import calc_mae,calc_mse,calc_mre
from pypots.optim import Adam
from pypots.imputation import SAITS
from pygrinder import (
mcar,
mar_logistic,
mnar_x,
mnar_t,
rdo,
seq_missing,
block_missing,
calc_missing_rate
)

设置随机种子，确保结果可复现

seed = 2024
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_random_seed(seed)

#数据读取
df_origin = pd.read_csv('your.csv',index_col=0)[:24000]#第一列时间作为索引
df_origin_numpy = df_origin.values # 转换为 NumPy 数组（如果数据集是 Pandas DataFrame）

#数据归一化---应该要在加入缺失值之前归一化
scaler = StandardScaler()
df_origin_numpy = scaler.fit_transform(df_origin_numpy)
num_rows, num_cols = df_origin_numpy.shape

#将2维数据转化为3维
target_shape = 48 # 每个样本的元素个数
A = num_rows // target_shape # A应该是行数除以 144*21 后的商
if num_rows % target_shape != 0:
A = num_rows // target_shape # 使用最大整除后的 A
df_origin_numpy = df_origin_numpy[:A * target_shape, :]
df_origin_numpy = df_origin_numpy.reshape(A, 48, 25)

##数据缺失设置

df_with_missing = df_origin_numpy

df_with_missing = mcar(df_origin_numpy, p=0.3)

df_with_missing = mnar_t(df_origin_numpy, cycle=20, pos=10, scale=3)

df_with_missing = seq_missing(df_origin_numpy, p=0.3, seq_len=5)

#数据、掩码设置
dataset_for_testing = {
"X": df_with_missing,
"y": df_origin_numpy
}
test_X_indicating_mask = np.isnan(df_with_missing)
test_X_ori = np.nan_to_num(df_origin_numpy) # 将原始数据中的 NaN 转换为 0

sample_num, sequence_length, num_features = df_with_missing.shape

def calc_r2(predictions, targets, masks=None, ):
"""计算 R²（决定系数），只针对非缺失部分计算"""
if masks is not None:
# 只计算非缺失数据
predictions = predictions * masks
targets = targets * masks

# 计算残差平方和 (RSS)
residual_sum_of_squares = np.sum((targets - predictions) ** 2)

# 计算总平方和 (TSS)
total_sum_of_squares = np.sum((targets - np.mean(targets)) ** 2)

# 计算 R²
r2 = 1 - residual_sum_of_squares / (total_sum_of_squares + 1e-12)  # 防止除以零

return r2

def evaluate_model(n_layers, d_model, d_ffn,n_heads, d_k, d_v, lr,dataset):
"""
创建并训练 SAITS 模型，并返回评估结果。
"""
saits = SAITS(
n_steps=sequence_length, # 时间步长，即每个样本的长度
n_features=num_features, # 特征数量
n_layers=n_layers,
d_model=d_model,
d_ffn=d_ffn, # 假设 d_ffn 与 d_model 相同
n_heads=n_heads,
d_k=d_k,
d_v=d_v,
dropout=0.1,
ORT_weight=1, # 你可以根据需要调整这些权重
MIT_weight=1,
batch_size=8,
epochs=10,
patience=2,
optimizer=Adam(lr=lr),
num_workers=0,
device="cuda", # 自动选择设备
saving_path="tutorial_results/imputation/saits", # 保存路径
model_saving_strategy="best", # 只保存最好的模型
)

# 训练模型
saits.fit(train_set=dataset)

# 测试阶段：对缺失值进行插补
saits_results = saits.predict(dataset)
saits_imputation = saits_results["imputation"]

return saits_imputation

saits_output = evaluate_model(n_layers=1, d_model=64, d_ffn=64,n_heads=1, d_k=64, d_v=64, lr=1e-4,dataset=dataset_for_testing)

saits_output = evaluate_model(n_layers=2, d_model=64, d_ffn=64,n_heads=1, d_k=64, d_v=64, lr=1e-4,dataset=dataset_for_testing)

testing_mae = calc_mae(saits_output, test_X_ori, test_X_indicating_mask)

print(f"Testing MAE: {testing_mae:.4f}")

mse = calc_mse(saits_output, test_X_ori, test_X_indicating_mask)
print(f"Testing MSE: {mse:.4f}")

计算平均相对误差 MRE

mre = calc_mre(saits_output, test_X_ori, test_X_indicating_mask)
print(f"Testing MRE: {mre:.4f}")

R2 = calc_r2(saits_output, test_X_ori, test_X_indicating_mask)
print(f"Testing R2: {R2:.4f}")

The text was updated successfully, but these errors were encountered:

github-actions · 2024-12-04T00:59:25Z

Hi there 👋,

Thank you so much for your attention to PyPOTS! You can follow me on GitHub
to receive the latest news of PyPOTS. If you find PyPOTS helpful to your work, please star⭐️ this repository.
Your star is your recognition, which can help more people notice PyPOTS and grow PyPOTS community.
It matters and is definitely a kind of contribution to the community.

I have received your message and will respond ASAP. Thank you for your patience! 😃

Best,
Wenjie

liujian123223 · 2024-12-04T01:00:21Z

`import random
import numpy as np
import benchpots
from pypots.utils.random import set_random_seed
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from pypots.utils.metrics import calc_mae,calc_mse,calc_mre
from pypots.optim import Adam
from pypots.imputation import SAITS
from pygrinder import (
mcar,
mar_logistic,
mnar_x,
mnar_t,
rdo,
seq_missing,
block_missing,
calc_missing_rate
)

设置随机种子，确保结果可复现

seed = 2024
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_random_seed(seed)

#数据读取
df_origin = pd.read_csv('your.csv',index_col=0)[:24000]#第一列时间作为索引
df_origin_numpy = df_origin.values # 转换为 NumPy 数组（如果数据集是 Pandas DataFrame）

#数据归一化---应该要在加入缺失值之前归一化
scaler = StandardScaler()
df_origin_numpy = scaler.fit_transform(df_origin_numpy)
num_rows, num_cols = df_origin_numpy.shape

#将2维数据转化为3维
target_shape = 48 # 每个样本的元素个数
A = num_rows // target_shape # A应该是行数除以 144*21 后的商
if num_rows % target_shape != 0:
A = num_rows // target_shape # 使用最大整除后的 A
df_origin_numpy = df_origin_numpy[:A * target_shape, :]
df_origin_numpy = df_origin_numpy.reshape(A, 48, 25)

##数据缺失设置

df_with_missing = df_origin_numpy

df_with_missing = mcar(df_origin_numpy, p=0.3)

df_with_missing = mnar_t(df_origin_numpy, cycle=20, pos=10, scale=3)

df_with_missing = seq_missing(df_origin_numpy, p=0.3, seq_len=5)

#数据、掩码设置
dataset_for_testing = {
"X": df_with_missing,
"y": df_origin_numpy
}
test_X_indicating_mask = np.isnan(df_with_missing)
test_X_ori = np.nan_to_num(df_origin_numpy) # 将原始数据中的 NaN 转换为 0

sample_num, sequence_length, num_features = df_with_missing.shape

def calc_r2(predictions, targets, masks=None, ):
"""计算 R²（决定系数），只针对非缺失部分计算"""
if masks is not None:
# 只计算非缺失数据
predictions = predictions * masks
targets = targets * masks

# 计算残差平方和 (RSS)
residual_sum_of_squares = np.sum((targets - predictions) ** 2)

# 计算总平方和 (TSS)
total_sum_of_squares = np.sum((targets - np.mean(targets)) ** 2)

# 计算 R²
r2 = 1 - residual_sum_of_squares / (total_sum_of_squares + 1e-12)  # 防止除以零

return r2

def evaluate_model(n_layers, d_model, d_ffn,n_heads, d_k, d_v, lr,dataset):
"""
创建并训练 SAITS 模型，并返回评估结果。
"""
saits = SAITS(
n_steps=sequence_length, # 时间步长，即每个样本的长度
n_features=num_features, # 特征数量
n_layers=n_layers,
d_model=d_model,
d_ffn=d_ffn, # 假设 d_ffn 与 d_model 相同
n_heads=n_heads,
d_k=d_k,
d_v=d_v,
dropout=0.1,
ORT_weight=1, # 你可以根据需要调整这些权重
MIT_weight=1,
batch_size=8,
epochs=10,
patience=2,
optimizer=Adam(lr=lr),
num_workers=0,
device="cuda", # 自动选择设备
saving_path="tutorial_results/imputation/saits", # 保存路径
model_saving_strategy="best", # 只保存最好的模型
)

# 训练模型
saits.fit(train_set=dataset)

# 测试阶段：对缺失值进行插补
saits_results = saits.predict(dataset)
saits_imputation = saits_results["imputation"]

return saits_imputation

saits_output = evaluate_model(n_layers=1, d_model=64, d_ffn=64,n_heads=1, d_k=64, d_v=64, lr=1e-4,dataset=dataset_for_testing)

saits_output = evaluate_model(n_layers=2, d_model=64, d_ffn=64,n_heads=1, d_k=64, d_v=64, lr=1e-4,dataset=dataset_for_testing)

testing_mae = calc_mae(saits_output, test_X_ori, test_X_indicating_mask)

print(f"Testing MAE: {testing_mae:.4f}")

mse = calc_mse(saits_output, test_X_ori, test_X_indicating_mask)
print(f"Testing MSE: {mse:.4f}")

计算平均相对误差 MRE

mre = calc_mre(saits_output, test_X_ori, test_X_indicating_mask)
print(f"Testing MRE: {mre:.4f}")

R2 = calc_r2(saits_output, test_X_ori, test_X_indicating_mask)
print(f"Testing R2: {R2:.4f}")`

github-actions · 2024-12-19T00:18:07Z

This issue had no activity for 14 days. It will be closed in 1 week unless there is some new activity. Is this issue already resolved?

liujian123223 added the question Further information is requested label Dec 4, 2024

github-actions bot added the stale label Dec 19, 2024

github-actions bot closed this as not planned Won't fix, can't repro, duplicate, stale Dec 27, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

When running saits multiple times, the same parameters cannot yield the same results. #555

When running saits multiple times, the same parameters cannot yield the same results. #555

liujian123223 commented Dec 4, 2024

github-actions bot commented Dec 4, 2024

liujian123223 commented Dec 4, 2024

github-actions bot commented Dec 19, 2024

When running saits multiple times, the same parameters cannot yield the same results. #555

When running saits multiple times, the same parameters cannot yield the same results. #555

Comments

liujian123223 commented Dec 4, 2024

Issue description

设置随机种子，确保结果可复现

df_with_missing = df_origin_numpy

df_with_missing = mnar_t(df_origin_numpy, cycle=20, pos=10, scale=3)

df_with_missing = seq_missing(df_origin_numpy, p=0.3, seq_len=5)

计算 平均相对误差 MRE

github-actions bot commented Dec 4, 2024

liujian123223 commented Dec 4, 2024

设置随机种子，确保结果可复现

df_with_missing = df_origin_numpy

df_with_missing = mnar_t(df_origin_numpy, cycle=20, pos=10, scale=3)

df_with_missing = seq_missing(df_origin_numpy, p=0.3, seq_len=5)

计算 平均相对误差 MRE

github-actions bot commented Dec 19, 2024

计算平均相对误差 MRE

计算平均相对误差 MRE