[Bug]: "model.policy.evaluate_actions()" does not work in "RecurrentPPO" #270
Labels: bug, check the checklist, more information needed

Comments

araffin added the "more information needed" (Please fill the issue template completely) and "check the checklist" (You have checked the required items in the checklist but you didn't do what is written...) labels on Jan 16, 2025
# Importing libraries
from collections import namedtuple
from sb3_contrib import RecurrentPPO
import numpy as np
import torch as th

# !!! Set to True to SHOW THE ERROR !!!
SHOW_ERROR = False  # True, False

# Simple (pi, vf) container used to pass LSTM states to the policy
LstmStates = namedtuple("LstmStates", ["pi", "vf"])

# Initialize the environment and model
model = RecurrentPPO("MlpLstmPolicy", "CartPole-v1").learn(200)
env = model.get_env()

# Reset the environment
observation_lst = env.reset()
observation_lst = observation_lst[0]
state = None
done = True
reward_sum = 0
# Run the simulation for 100 steps
for step in range(100):
    # Predict the action and update the LSTM state
    actions, state = model.predict(
        observation=np.array(observation_lst, dtype=np.float32),
        state=state,
        episode_start=np.array([done], dtype=np.bool_),
        deterministic=True,
    )
    actions = actions.item()

    # Prepare tensors for policy evaluation
    obs = th.tensor(np.array([observation_lst], dtype=np.float32), dtype=th.float32, device=model.policy.device)
    act = th.tensor(np.array([actions], dtype=np.int64), dtype=th.int64, device=model.policy.device)
    # state is the (hidden, cell) pair returned by predict(); wrap it in the pi/vf container
    pi = th.tensor(state[0], dtype=th.float32, device=model.policy.device)
    vf = th.tensor(state[1], dtype=th.float32, device=model.policy.device)
    lstm_states = LstmStates(pi=pi, vf=vf)
    episode_starts = th.tensor(np.array([done], dtype=np.bool_), dtype=th.float32, device=model.policy.device)

    # Evaluate policy outputs (get_distribution() returns a (distribution, lstm_states) tuple)
    action_probability_distribution = model.policy.get_distribution(obs=obs, lstm_states=lstm_states, episode_starts=episode_starts)
    state_value_estimates_a = model.policy.predict_values(obs=obs, lstm_states=lstm_states, episode_starts=episode_starts)
    if SHOW_ERROR:
        state_value_estimates_b, log_probabilities_of_actions, action_distribution_entropy = model.policy.evaluate_actions(obs=obs, actions=act, lstm_states=lstm_states, episode_starts=episode_starts)

    # Extract key metrics
    _action_probability_distribution_0 = action_probability_distribution[0].distribution.probs.detach().cpu().numpy()[0][0]
    _action_probability_distribution_1 = action_probability_distribution[0].distribution.probs.detach().cpu().numpy()[0][1]
    _state_value_estimates_a = state_value_estimates_a.detach().cpu().numpy()[0][0]
    if SHOW_ERROR:
        _state_value_estimates_b = state_value_estimates_b.detach().cpu().numpy()[0][0]
        _log_probabilities_of_actions = log_probabilities_of_actions.detach().cpu().numpy()[0]
        _action_distribution_entropy = action_distribution_entropy.detach().cpu().numpy()[0]

    # Print information for the current step
    print("----------------------------------------------------------------------------------------------------------")
    print(f"step: {step} | actions: {actions} | done: {done}")
    print("----------------------------------")
    print("ACTION_PROBABILITY_DISTRIBUTION :", action_probability_distribution[0].distribution.probs)
    print("ACTION_PROBABILITY_DISTRIBUTION_0:", _action_probability_distribution_0)
    print("ACTION_PROBABILITY_DISTRIBUTION_1:", _action_probability_distribution_1)
    print("----------------------------------")
    print("STATE_VALUE_ESTIMATES_A :", state_value_estimates_a)
    print("STATE_VALUE_ESTIMATES_A :", _state_value_estimates_a)
    if SHOW_ERROR:
        print("----------------------------------")
        print("STATE_VALUE_ESTIMATES_B :", state_value_estimates_b)
        print("STATE_VALUE_ESTIMATES_B :", _state_value_estimates_b)
        print("LOG_PROBABILITIES_OF_ACTIONS :", log_probabilities_of_actions)
        print("LOG_PROBABILITIES_OF_ACTIONS :", _log_probabilities_of_actions)
        print("ACTION_DISTRIBUTION_ENTROPY :", action_distribution_entropy)
        print("ACTION_DISTRIBUTION_ENTROPY :", _action_distribution_entropy)
    print("----------------------------------------------------------------------------------------------------------")

    # Step the environment and collect the results
    observation_lst, reward, done, info = env.step(np.array([actions], dtype=np.int64))
    observation_lst = observation_lst[0]
    reward_sum += reward[0]
    done = bool(done[0])

# Print the final step and total reward
print(f"final step: {step} | total reward: {reward_sum}")
# Traceback (most recent call last):
# File "/home/drulye/vscode/project/error_rppo_evaluate_actions.py", line 48, in <module>
# state_value_estimates_b, log_probabilities_of_actions, action_distribution_entropy = model.policy.evaluate_actions(obs=obs, actions=act, lstm_states=lstm_states, episode_starts=episode_starts)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# File "/home/drulye/anaconda3/lib/python3.12/site-packages/sb3_contrib/common/recurrent/policies.py", line 331, in evaluate_actions
# latent_pi, _ = self._process_sequence(pi_features, lstm_states.pi, episode_starts, self.lstm_actor)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# File "/home/drulye/anaconda3/lib/python3.12/site-packages/sb3_contrib/common/recurrent/policies.py", line 186, in _process_sequence
# features_sequence = features.reshape((n_seq, -1, lstm.input_size)).swapaxes(0, 1)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# RuntimeError: shape '[256, -1, 4]' is invalid for input of size 4
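For reference, the 256 in the failing reshape matches the default lstm_hidden_size, so the hidden-state tensor itself seems to be interpreted as a batch of 256 sequences. Judging from the traceback, evaluate_actions() appears to expect lstm_states.pi and lstm_states.vf to each be a (hidden, cell) pair of shape (n_lstm_layers, n_seq, hidden_size), while the script above passes a single tensor per field. Below is only a sketch of how the call might be shaped under that assumption, reusing the names state, obs, act, episode_starts, LstmStates and model from the script; it is not a confirmed fix, and it reuses the actor's LSTM state for the value branch as a further simplification.

# Hedged sketch, not a confirmed fix: assumes evaluate_actions() wants
# lstm_states.pi == (h, c) and lstm_states.vf == (h, c), each tensor of
# shape (n_lstm_layers, n_seq, hidden_size).
h = th.tensor(state[0], dtype=th.float32, device=model.policy.device)  # hidden state returned by predict()
c = th.tensor(state[1], dtype=th.float32, device=model.policy.device)  # cell state returned by predict()
lstm_states_hc = LstmStates(pi=(h, c), vf=(h, c))  # actor state reused for the critic branch
values, log_prob, entropy = model.policy.evaluate_actions(
    obs=obs, actions=act, lstm_states=lstm_states_hc, episode_starts=episode_starts
)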