[Bug]: in "RecurrentPPO" not work "model.policy.evaluate_actions()" #270

Open
drulye opened this issue Jan 15, 2025 · 1 comment
Labels: bug (Something isn't working), check the checklist (You have checked the required items in the checklist but you didn't do what is written...), more information needed (Please fill the issue template completely)

Comments

drulye commented Jan 15, 2025

🐛 Bug

in "RecurrentPPO" not work "model.policy.evaluate_actions()"

drulye added the bug (Something isn't working) label on Jan 15, 2025
araffin added the more information needed (Please fill the issue template completely) and check the checklist (You have checked the required items in the checklist but you didn't do what is written...) labels on Jan 16, 2025

drulye commented Jan 16, 2025

# Importing libraries
from collections import namedtuple
from sb3_contrib import RecurrentPPO
import numpy as np
import torch as th

# !!! Set to True to SHOW ERROR !!!
SHOW_ERROR = False  # True, False

# Initialize environment and model
model = RecurrentPPO("MlpLstmPolicy", "CartPole-v1").learn(200)
env = model.get_env()

# Reset environment
observation_lst = env.reset()
observation_lst = observation_lst[0]
state = None
done = "True"
reward_sum = 0

# Run simulation for 100 steps
for step in range(100):

    # Predict actions and update LSTM state
    actions, state = model.predict(
        observation=np.array(observation_lst, dtype=np.float32),
        state=state,
        episode_start=np.array([done], dtype=np.bool_),
        deterministic=True,
    )
    actions = actions.item()

    # Prepare tensors for policy evaluation
    obs = th.tensor(np.array([observation_lst], dtype=np.float32), dtype=th.float32, device=model.policy.device)
    act = th.tensor(np.array([actions], dtype=np.int64), dtype=th.int64, device=model.policy.device)
    net_arch = namedtuple("net_arch", ["pi", "vf"])
    pi = th.tensor(state[0], dtype=th.float32, device=model.policy.device)
    vf = th.tensor(state[1], dtype=th.float32, device=model.policy.device)
    lstm_states = net_arch(pi=pi, vf=vf)
    episode_starts = th.tensor(np.array([done], dtype=np.bool_), dtype=th.int64, device=model.policy.device)

    # Evaluate policy outputs
    action_probability_distribution = model.policy.get_distribution(obs=obs, lstm_states=lstm_states, episode_starts=episode_starts)
    state_value_estimates_a = model.policy.predict_values(obs=obs, lstm_states=lstm_states, episode_starts=episode_starts)
    if SHOW_ERROR:
        state_value_estimates_b, log_probabilities_of_actions, action_distribution_entropy = model.policy.evaluate_actions(obs=obs, actions=act, lstm_states=lstm_states, episode_starts=episode_starts)

    # Extract key metrics
    _action_probability_distribution_0 = action_probability_distribution[0].distribution.probs.detach().cpu().numpy()[0][0]
    _action_probability_distribution_1 = action_probability_distribution[0].distribution.probs.detach().cpu().numpy()[0][1]
    _state_value_estimates_a = state_value_estimates_a.detach().cpu().numpy()[0][0]
    if SHOW_ERROR:
        _state_value_estimates_b = state_value_estimates_b.detach().cpu().numpy()[0][0]
        _log_probabilities_of_actions = log_probabilities_of_actions.detach().cpu().numpy()[0]
        _action_distribution_entropy = action_distribution_entropy.detach().cpu().numpy()[0]

    # Print information for current step
    print("----------------------------------------------------------------------------------------------------------")
    print(f"step: {step} | actions: {actions} | done: {done}")
    print("----------------------------------")
    print("ACTION_PROBABILITY_DISTRIBUTION  :", action_probability_distribution[0].distribution.probs)
    print("ACTION_PROBABILITY_DISTRIBUTION_0:", _action_probability_distribution_0)
    print("ACTION_PROBABILITY_DISTRIBUTION_1:", _action_probability_distribution_1)
    print("----------------------------------")
    print("STATE_VALUE_ESTIMATES_A          :", state_value_estimates_a)
    print("STATE_VALUE_ESTIMATES_A          :", _state_value_estimates_a)
    if SHOW_ERROR:
        print("----------------------------------")
        print("STATE_VALUE_ESTIMATES_B          :", state_value_estimates_b)
        print("STATE_VALUE_ESTIMATES_B          :", _state_value_estimates_b)
        print("LOG_PROBABILITIES_OF_ACTIONS     :", log_probabilities_of_actions)
        print("LOG_PROBABILITIES_OF_ACTIONS     :", _log_probabilities_of_actions)
        print("ACTION_DISTRIBUTION_ENTROPY      :", action_distribution_entropy)
        print("ACTION_DISTRIBUTION_ENTROPY      :", _action_distribution_entropy)
    print("----------------------------------------------------------------------------------------------------------")

    # Perform action in environment and retrieve results
    observation_lst, reward, done, info = env.step(np.array([actions], dtype=np.int64))
    observation_lst = observation_lst[0]
    reward_sum += reward[0]
    done = bool(done[0])

# Print final step and total reward
print(f"final step: {step} | total reward: {reward_sum}")

# Traceback (most recent call last):
#   File "/home/drulye/vscode/project/error_rppo_evaluate_actions.py", line 48, in <module>
#     state_value_estimates_b, log_probabilities_of_actions, action_distribution_entropy = model.policy.evaluate_actions(obs=obs, actions=act, lstm_states=lstm_states, episode_starts=episode_starts)
#                                                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#   File "/home/drulye/anaconda3/lib/python3.12/site-packages/sb3_contrib/common/recurrent/policies.py", line 331, in evaluate_actions
#     latent_pi, _ = self._process_sequence(pi_features, lstm_states.pi, episode_starts, self.lstm_actor)
#                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#   File "/home/drulye/anaconda3/lib/python3.12/site-packages/sb3_contrib/common/recurrent/policies.py", line 186, in _process_sequence
#     features_sequence = features.reshape((n_seq, -1, lstm.input_size)).swapaxes(0, 1)
#                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# RuntimeError: shape '[256, -1, 4]' is invalid for input of size 4
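
For reference, the '[256, -1, 4]' reshape in the traceback suggests that _process_sequence reads the number of sequences from the shape of the LSTM hidden state it is given: evaluate_actions expects lstm_states whose pi and vf fields are each (hidden, cell) tuples (the RNNStates layout from sb3_contrib.common.recurrent.type_aliases), whereas the script above passes a single tensor per field. Below is only a hedged sketch, reusing the variables from the script above; feeding the actor states returned by model.predict() to the critic as well is an approximation made purely for illustration.

from sb3_contrib.common.recurrent.type_aliases import RNNStates

# Sketch only: pack the (hidden, cell) pair returned by model.predict() into the
# RNNStates layout that evaluate_actions expects. Reusing the same states for the
# actor (pi) and the critic (vf) is an assumption made for illustration.
hidden = th.tensor(state[0], dtype=th.float32, device=model.policy.device)
cell = th.tensor(state[1], dtype=th.float32, device=model.policy.device)
lstm_states_full = RNNStates(pi=(hidden, cell), vf=(hidden, cell))

values, log_prob, entropy = model.policy.evaluate_actions(
    obs=obs, actions=act, lstm_states=lstm_states_full, episode_starts=episode_starts
)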
