From 0736be9bdb88846f84fbc47715398af638b284a8 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas <30759571+Kallinteris-Andreas@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:37:51 +0300 Subject: [PATCH] Add MuJoCo v5 environments (#572) --- gymnasium/envs/__init__.py | 80 ++- gymnasium/envs/mujoco/ant_v4.py | 187 ----- gymnasium/envs/mujoco/ant_v5.py | 437 +++++++++++ gymnasium/envs/mujoco/assets/walker2d_v5.xml | 68 ++ gymnasium/envs/mujoco/half_cheetah_v4.py | 121 ---- gymnasium/envs/mujoco/half_cheetah_v5.py | 291 ++++++++ gymnasium/envs/mujoco/hopper_v4.py | 127 ---- gymnasium/envs/mujoco/hopper_v5.py | 359 ++++++++++ gymnasium/envs/mujoco/humanoid_v4.py | 243 ------- gymnasium/envs/mujoco/humanoid_v5.py | 548 ++++++++++++++ gymnasium/envs/mujoco/humanoidstandup_v4.py | 230 ------ gymnasium/envs/mujoco/humanoidstandup_v5.py | 491 +++++++++++++ .../mujoco/inverted_double_pendulum_v4.py | 111 --- .../mujoco/inverted_double_pendulum_v5.py | 242 +++++++ gymnasium/envs/mujoco/inverted_pendulum_v4.py | 81 --- gymnasium/envs/mujoco/inverted_pendulum_v5.py | 205 ++++++ gymnasium/envs/mujoco/pusher_v4.py | 127 ---- gymnasium/envs/mujoco/pusher_v5.py | 271 +++++++ gymnasium/envs/mujoco/reacher_v4.py | 111 --- gymnasium/envs/mujoco/reacher_v5.py | 239 +++++++ gymnasium/envs/mujoco/swimmer_v4.py | 117 --- gymnasium/envs/mujoco/swimmer_v5.py | 285 ++++++++ gymnasium/envs/mujoco/walker2d_v4.py | 132 ---- gymnasium/envs/mujoco/walker2d_v5.py | 354 +++++++++ tests/envs/mujoco/test_mujoco_v5.py | 676 ++++++++++++++++++ tests/envs/registration/test_make.py | 2 +- 26 files changed, 4546 insertions(+), 1589 deletions(-) create mode 100644 gymnasium/envs/mujoco/ant_v5.py create mode 100644 gymnasium/envs/mujoco/assets/walker2d_v5.xml create mode 100644 gymnasium/envs/mujoco/half_cheetah_v5.py create mode 100644 gymnasium/envs/mujoco/hopper_v5.py create mode 100644 gymnasium/envs/mujoco/humanoid_v5.py create mode 100644 gymnasium/envs/mujoco/humanoidstandup_v5.py create mode 100644 gymnasium/envs/mujoco/inverted_double_pendulum_v5.py create mode 100644 gymnasium/envs/mujoco/inverted_pendulum_v5.py create mode 100644 gymnasium/envs/mujoco/pusher_v5.py create mode 100644 gymnasium/envs/mujoco/reacher_v5.py create mode 100644 gymnasium/envs/mujoco/swimmer_v5.py create mode 100644 gymnasium/envs/mujoco/walker2d_v5.py create mode 100644 tests/envs/mujoco/test_mujoco_v5.py diff --git a/gymnasium/envs/__init__.py b/gymnasium/envs/__init__.py index c41b76b9c..e28a0cb35 100644 --- a/gymnasium/envs/__init__.py +++ b/gymnasium/envs/__init__.py @@ -180,7 +180,7 @@ # Mujoco # ---------------------------------------- -# 2D +# manipulation register( id="Reacher-v2", @@ -196,6 +196,13 @@ reward_threshold=-3.75, ) +register( + id="Reacher-v5", + entry_point="gymnasium.envs.mujoco.reacher_v5:ReacherEnv", + max_episode_steps=50, + reward_threshold=-3.75, +) + register( id="Pusher-v2", entry_point="gymnasium.envs.mujoco:PusherEnv", @@ -210,6 +217,15 @@ reward_threshold=0.0, ) +register( + id="Pusher-v5", + entry_point="gymnasium.envs.mujoco.pusher_v5:PusherEnv", + max_episode_steps=100, + reward_threshold=0.0, +) + +# balance + register( id="InvertedPendulum-v2", entry_point="gymnasium.envs.mujoco:InvertedPendulumEnv", @@ -224,6 +240,13 @@ reward_threshold=950.0, ) +register( + id="InvertedPendulum-v5", + entry_point="gymnasium.envs.mujoco.inverted_pendulum_v5:InvertedPendulumEnv", + max_episode_steps=1000, + reward_threshold=950.0, +) + register( id="InvertedDoublePendulum-v2", entry_point="gymnasium.envs.mujoco:InvertedDoublePendulumEnv", @@ -238,6 +261,15 @@ reward_threshold=9100.0, ) +register( + id="InvertedDoublePendulum-v5", + entry_point="gymnasium.envs.mujoco.inverted_double_pendulum_v5:InvertedDoublePendulumEnv", + max_episode_steps=1000, + reward_threshold=9100.0, +) + +# runners + register( id="HalfCheetah-v2", entry_point="gymnasium.envs.mujoco:HalfCheetahEnv", @@ -259,6 +291,13 @@ reward_threshold=4800.0, ) +register( + id="HalfCheetah-v5", + entry_point="gymnasium.envs.mujoco.half_cheetah_v5:HalfCheetahEnv", + max_episode_steps=1000, + reward_threshold=4800.0, +) + register( id="Hopper-v2", entry_point="gymnasium.envs.mujoco:HopperEnv", @@ -280,6 +319,13 @@ reward_threshold=3800.0, ) +register( + id="Hopper-v5", + entry_point="gymnasium.envs.mujoco.hopper_v5:HopperEnv", + max_episode_steps=1000, + reward_threshold=3800.0, +) + register( id="Swimmer-v2", entry_point="gymnasium.envs.mujoco:SwimmerEnv", @@ -301,6 +347,13 @@ reward_threshold=360.0, ) +register( + id="Swimmer-v5", + entry_point="gymnasium.envs.mujoco.swimmer_v5:SwimmerEnv", + max_episode_steps=1000, + reward_threshold=360.0, +) + register( id="Walker2d-v2", max_episode_steps=1000, @@ -319,6 +372,12 @@ entry_point="gymnasium.envs.mujoco.walker2d_v4:Walker2dEnv", ) +register( + id="Walker2d-v5", + max_episode_steps=1000, + entry_point="gymnasium.envs.mujoco.walker2d_v5:Walker2dEnv", +) + register( id="Ant-v2", entry_point="gymnasium.envs.mujoco:AntEnv", @@ -340,6 +399,13 @@ reward_threshold=6000.0, ) +register( + id="Ant-v5", + entry_point="gymnasium.envs.mujoco.ant_v5:AntEnv", + max_episode_steps=1000, + reward_threshold=6000.0, +) + register( id="Humanoid-v2", entry_point="gymnasium.envs.mujoco:HumanoidEnv", @@ -358,6 +424,12 @@ max_episode_steps=1000, ) +register( + id="Humanoid-v5", + entry_point="gymnasium.envs.mujoco.humanoid_v5:HumanoidEnv", + max_episode_steps=1000, +) + register( id="HumanoidStandup-v2", entry_point="gymnasium.envs.mujoco:HumanoidStandupEnv", @@ -370,6 +442,12 @@ max_episode_steps=1000, ) +register( + id="HumanoidStandup-v5", + entry_point="gymnasium.envs.mujoco.humanoidstandup_v5:HumanoidStandupEnv", + max_episode_steps=1000, +) + # --- For shimmy compatibility def _raise_shimmy_error(*args: Any, **kwargs: Any): diff --git a/gymnasium/envs/mujoco/ant_v4.py b/gymnasium/envs/mujoco/ant_v4.py index 4bec72528..310c444ce 100644 --- a/gymnasium/envs/mujoco/ant_v4.py +++ b/gymnasium/envs/mujoco/ant_v4.py @@ -11,193 +11,6 @@ class AntEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the environment introduced by Schulman, - Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control - Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438). - The ant is a 3D robot consisting of one torso (free rotational body) with - four legs attached to it with each leg having two body parts. The goal is to - coordinate the four legs to move in the forward (right) direction by applying - torques on the eight hinges connecting the two body parts of each leg and the torso - (nine body parts and eight hinges). - - ## Action Space - The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | - | 0 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | - | 1 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | - | 2 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | - | 3 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | - | 7 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the ant, - followed by the velocities of those individual parts (their derivatives) with all - the positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the ant's torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (29,), float64)` where the first two observations - represent the x- and y- coordinates of the ant's torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, observation Space is a `Box(-Inf, Inf, (27,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 21 | angular velocity of angle between torso and front right link | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 26 | angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - - - If version < `v4` or `use_contact_forces` is `True` then the observation space is extended by 14*6 = 84 elements, which are contact forces - (external forces - force x, y, z and torque x, y, z) applied to the - center of mass of each of the body parts. The 14 body parts are: - - | id (for `v2`, `v3`, `v4)` | body parts | - | --- | ------------ | - | 0 | worldbody (note: forces are always full of zeros) | - | 1 | torso | - | 2 | front_left_leg | - | 3 | aux_1 (front left leg) | - | 4 | ankle_1 (front left leg) | - | 5 | front_right_leg | - | 6 | aux_2 (front right leg) | - | 7 | ankle_2 (front right leg) | - | 8 | back_leg (back left leg) | - | 9 | aux_3 (back left leg) | - | 10 | ankle_3 (back left leg) | - | 11 | right_back_leg | - | 12 | aux_4 (back right leg) | - | 13 | ankle_4 (back right leg) | - - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - - **Note:** Ant-v4 environment no longer has the following contact forces issue. - If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results - in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 - when using the Ant environment if you would like to report results with contact forces (if - contact forces are not used in your experiments, you can use version > 2.0). - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward` - - *forward_reward*: A reward of moving forward which is measured as - *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time - between actions and is dependent on the `frame_skip` parameter (default is 5), - where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*. - This reward would be positive if the ant moves forward (in positive x direction). - - *ctrl_cost*: A negative reward for penalising the ant if it takes actions - that are too large. It is measured as *`ctrl_cost_weight` * sum(action2)* - where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5. - - *contact_cost*: A negative reward for penalising the ant if the external contact - force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact - force to `contact_force_range`)2)*. - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. - - But if `use_contact_forces=True` or version < `v4` - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. - - In either case `info` will also contain the individual reward terms. - - ## Starting State - All observations start in state - (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise - with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for - stochasticity. Note that the initial z coordinate is intentionally selected - to be slightly high, thereby indicating a standing up ant. The initial orientation - is designed to make it face forward as well. - - ## Episode End - The ant is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0]) - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The ant is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Ant-v2') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Ant-v4', ctrl_cost_weight=0.1, ...) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|--------------|-------------------------------| - | `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model | - | `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) | - | `use_contact_forces` | **bool** | `False` | If true, it extends the observation space by adding contact forces (see `Observation Space` section) and includes contact_cost to the reward function (see `Rewards` section) | - | `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) | - | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | - | `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range | - | `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3, also removed contact forces from the default observation space (new variable `use_contact_forces=True` can restore them) - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py new file mode 100644 index 000000000..d4f3435d5 --- /dev/null +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -0,0 +1,437 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict, Tuple, Union + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "distance": 4.0, +} + + +class AntEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the environment introduced by Schulman, + Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control + Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438). + The ant is a 3D robot consisting of one torso (free rotational body) with + four legs attached to it with each leg having two body parts. The goal is to + coordinate the four legs to move in the forward (right) direction by applying + torques on the eight hinges connecting the two body parts of each leg and the torso + (nine body parts and eight hinges). + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Ant-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Ant-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Ant-v3 | `mujoco-py` | Maintained for reproducibility | + | Ant-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + ## Action Space + The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | + | 1 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | + | 2 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | + | 3 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | + | 7 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | + + + ## Observation Space + The observation Space consists of the following parts (in order): + + - qpos (13 elements by default):* Position values of the robots's body parts. + - qvel (14 elements):* The velocities of these individual body parts, + (their derivatives). + - *cfrc_ext (78 elements):* This is the center of mass based external forces on the body parts. + It has shape 13 * 6 (*nbody * 6*) and hence adds to another 78 elements in the state space. + (external forces - force x, y, z and torque x, y, z) + + By default, the observation does not include the x- and y-coordinates of the torso. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (107,), float64)`, where the first two observations are the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, however, the observation space is a `Box(-Inf, Inf, (105,), float64)`, where the position and velocity elements are as follows: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + | 13 | x-coordinate velocity of the torso | -Inf | Inf | root | free | velocity (m/s) | + | 14 | y-coordinate velocity of the torso | -Inf | Inf | root | free | velocity (m/s) | + | 15 | z-coordinate velocity of the torso | -Inf | Inf | root | free | velocity (m/s) | + | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | root | free | angular velocity (rad/s) | + | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | root | free | angular velocity (rad/s) | + | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | root | free | angular velocity (rad/s) | + | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 21 | angular velocity of angle between torso and front right link | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 26 | angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + + The body parts are: + + | body part | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ----------------------- | --- | --- | + | worldbody (note: all values are constant 0) | 0 |excluded| + | torso | 1 |0 | + | front_left_leg | 2 |1 | + | aux_1 (front left leg) | 3 |2 | + | ankle_1 (front left leg) | 4 |3 | + | front_right_leg | 5 |4 | + | aux_2 (front right leg) | 6 |5 | + | ankle_2 (front right leg) | 7 |6 | + | back_leg (back left leg) | 8 |7 | + | aux_3 (back left leg) | 9 |8 | + | ankle_3 (back left leg) | 10 |9 | + | right_back_leg | 11 |10 | + | aux_4 (back right leg) | 12 |11 | + | ankle_4 (back right leg) | 13 |12 | + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints in the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + + **Note:** + When using Ant-v3 or earlier versions, problems have been reported when using a Mujoco-Py version > 2.0 results + in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 + when using the Ant environment if you would like to report results with contact forces (if + contact forces are not used in your experiments, you can use version > 2.0). + + + ## Rewards + The total reward is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. + + - *healthy_reward*: + Every timestep that the Ant is healthy (see definition in section "Episode Termination"), + it gets a reward of fixed value `healthy_reward`. + - *forward_reward*: + A reward for moving forward, + this reward would be positive if the Ant moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the `main_body` ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions, which is depends on the `frame_skip` parameter (default is 5), + and `frametime`, which is 0.01 - so the default is $dt = 5 \times 0.01 = 0.05$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward to penalize the Ant for taking actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $0.5$). + - *contact_cost*: + A negative reward to penalize the Ant if the external contact forces are too large. + $w_{contact} \times \\|F_{contact}\\|_2^2$, where + $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-4}$), + $F_{contact}$ are the external contact forces clipped by `contact_force_range` (see `cfrc_ext` section on observation). + + `info` contains the individual reward terms. + + But if `use_contact_forces=false` on `v4` + The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. + + + ## Starting State + The initial position state is $[0.0, 0.0, 0.75, 1.0, 0.0, ... 0.0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{15}, reset\_noise\_scale \times 1_{15}]}$. + The initial velocity state is $\mathcal{N}(0_{14}, reset\_noise\_scale^2 \times I_{14})$. + + where $\mathcal{N}$ is the multivariate normal distribution and $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z- and x-coordinates are non-zero so that the ant can immediately stand up and face forward (x-axis). + + + ## Episode End + #### Termination + If `terminate_when_unhealthy is True` (the default), the environment terminates when the Ant is unhealthy. + the Ant is unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite + 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0]) + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + Ant provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('Ant-v5', ctrl_cost_weight=0.5, ...) + ``` + + | Parameter | Type | Default |Description | + |--------------------------------------------|------------|--------------|-------------------------------| + |`xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model | + |`forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward)| + |`ctrl_cost_weight` | **float** | `0.5` | Weight for _ctrl_cost_ term (see section on reward) | + |`contact_cost_weight` | **float** | `5e-4` | Weight for _contact_cost_ term (see section on reward) | + |`healthy_reward` | **float** | `1` | Weight for _healthy_reward_ term (see section on reward) | + |`main_body` |**str|int** | `1`("torso") | Name or ID of the body, whose displacement is used to calculate the *dx*/_forward_reward_ (useful for custom MuJoCo models)| + |`terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | + |`healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range | + |`contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* | + |`reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + |`exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + |`include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | + |`use_contact_forces` (`v4` only) | **bool** | `False` | If true, it extends the observation space by adding contact forces (see `Observation Space` section) and includes contact_cost to the reward function (see `Rewards` section) | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Ant is unhealthy), now it is only given when the Ant is healthy. The `info["reward_survive"]` is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - The reward function now always includes `contact_cost`, before it was only included if `use_contact_forces=True` (can be set to `0` with `contact_cost_weight=0`). + - Excluded the `cfrc_ext` of `worldbody` from the observation space as it was always 0, and thus provided no useful information to the agent, resulting is slightly faster training (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Added the `main_body` argument, which specifies the body used to compute the forward reward (mainly useful for custom MuJoCo models). + - Added the `forward_reward_weight` argument, which defaults to `1` (effectively the same behavior as in `v4`). + - Added the `include_cfrc_ext_in_observation` argument, previously in `v4` the inclusion of `cfrc_ext` observations was controlled by `use_contact_forces` which defaulted to `False`, while `include_cfrc_ext_in_observation` defaults to `True`. + - Removed the `use_contact_forces` argument (note: its functionality has been replaced by `include_cfrc_ext_in_observation` and `contact_cost_weight`) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/214)). + - Fixed `info["reward_ctrl"]` sometimes containing `contact_cost` instead of `ctrl_cost`. + - Fixed `info["x_position"]` & `info["y_position"]` & `info["distance_from_origin"]` giving `xpos` instead of `qpos` observations (`xpos` observations are behind 1 `mj_step()` more [here](https://github.com/deepmind/mujoco/issues/889#issuecomment-1568896388)) (related [Github issue #1](https://github.com/Farama-Foundation/Gymnasium/issues/521) & [Github issue #2](https://github.com/Farama-Foundation/Gymnasium/issues/539)). + - Removed `info["forward_reward"]` as it is equivalent to `info["reward_forward"]`. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3, also removed contact forces from the default observation space (new variable `use_contact_forces=True` can restore them) + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "ant.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1, + ctrl_cost_weight: float = 0.5, + contact_cost_weight: float = 5e-4, + healthy_reward: float = 1.0, + main_body: Union[int, str] = 1, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (0.2, 1.0), + contact_force_range: Tuple[float, float] = (-1.0, 1.0), + reset_noise_scale: float = 0.1, + exclude_current_positions_from_observation: bool = True, + include_cfrc_ext_in_observation: bool = True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + contact_cost_weight, + healthy_reward, + main_body, + terminate_when_unhealthy, + healthy_z_range, + contact_force_range, + reset_noise_scale, + exclude_current_positions_from_observation, + include_cfrc_ext_in_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._contact_cost_weight = contact_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._contact_force_range = contact_force_range + + self._main_body = main_body + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + self._include_cfrc_ext_in_observation = include_cfrc_ext_in_observation + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, # needs to be defined after + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = self.data.qpos.size + self.data.qvel.size + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation + + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + "cfrc_ext": self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def contact_forces(self): + raw_contact_forces = self.data.cfrc_ext + min_value, max_value = self._contact_force_range + contact_forces = np.clip(raw_contact_forces, min_value, max_value) + return contact_forces + + @property + def contact_cost(self): + contact_cost = self._contact_cost_weight * np.sum( + np.square(self.contact_forces) + ) + return contact_cost + + @property + def is_healthy(self): + state = self.state_vector() + min_z, max_z = self._healthy_z_range + is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + return terminated + + def step(self, action): + xy_position_before = self.data.body(self._main_body).xpos[:2].copy() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.data.body(self._main_body).xpos[:2].copy() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + forward_reward = x_velocity * self._forward_reward_weight + healthy_reward = self.healthy_reward + rewards = forward_reward + healthy_reward + + ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + costs = ctrl_cost + contact_cost + + observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_contact": -contact_cost, + "reward_survive": healthy_reward, + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + if self._include_cfrc_ext_in_observation: + contact_force = self.contact_forces[1:].flat.copy() + return np.concatenate((position, velocity, contact_force)) + else: + return np.concatenate((position, velocity)) + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = ( + self.init_qvel + + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv) + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + } diff --git a/gymnasium/envs/mujoco/assets/walker2d_v5.xml b/gymnasium/envs/mujoco/assets/walker2d_v5.xml new file mode 100644 index 000000000..12baef14b --- /dev/null +++ b/gymnasium/envs/mujoco/assets/walker2d_v5.xml @@ -0,0 +1,68 @@ + + + + + + + + diff --git a/gymnasium/envs/mujoco/half_cheetah_v4.py b/gymnasium/envs/mujoco/half_cheetah_v4.py index f90d59065..bc835c0ee 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v4.py +++ b/gymnasium/envs/mujoco/half_cheetah_v4.py @@ -13,127 +13,6 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the work by P. WawrzyƄski in - ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf). - The HalfCheetah is a 2-dimensional robot consisting of 9 body parts and 8 - joints connecting them (including two paws). The goal is to apply a torque - on the joints to make the cheetah run forward (right) as fast as possible, - with a positive reward allocated based on the distance moved forward and a - negative reward allocated for moving backward. The torso and head of the - cheetah are fixed, and the torque can only be applied on the other 6 joints - over the front and back thighs (connecting to the torso), shins - (connecting to the thighs) and feet (connecting to the shins). - - ## Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | - | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) | - | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) | - | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) | - | 3 | Torque applied on the front thigh rotor | -1 | 1 | fthigh | hinge | torque (N m) | - | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) | - | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) | - - - ## Observation Space - Observations consist of positional values of different body parts of the - cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the cheetah's `rootx`. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (18,), float64)` where the first element - represents the `rootx`. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the - will be returned in `info` with key `"x_position"`. - - However, by default, the observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | - | 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) | - | 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) | - | 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) | - | 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) | - | 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) | - | 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) | - | 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) | - | 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) | - | 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) | - | 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | - | 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | - | excluded | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) | - - ## Rewards - The reward consists of two parts: - - *forward_reward*: A reward of moving forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (fixed to 5), where the frametime is 0.01 - making the - default *dt = 5 * 0.01 = 0.05*. This reward would be positive if the cheetah - runs forward (right). - - *ctrl_cost*: A cost for penalising the cheetah if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 0.1 - - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the - initial state for stochasticity. As seen before, the first 8 values in the - state are positional and the last 9 values are velocity. A uniform noise in - the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard - normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the - initial velocity values of all zeros. - - ## Episode End - The episode truncates when the episode length is greater than 1000. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('HalfCheetah-v2') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('HalfCheetah-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ weight (see section on reward) | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py new file mode 100644 index 000000000..7ded23ec3 --- /dev/null +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -0,0 +1,291 @@ +__credits__ = ["Kallinteris-Andreas", "Rushiv Arora"] + +from typing import Dict + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "distance": 4.0, +} + + +class HalfCheetahEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the work by P. WawrzyƄski in + ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf). + The HalfCheetah is a 2-dimensional robot consisting of 9 body parts and 8 + joints connecting them (including two paws). The goal is to apply a torque + on the joints to make the cheetah run forward (right) as fast as possible, + with a positive reward allocated based on the distance moved forward and a + negative reward allocated for moving backward. The torso and head of the + cheetah are fixed, and the torque can only be applied on the other 6 joints + over the front and back thighs (connecting to the torso), shins + (connecting to the thighs) and feet (connecting to the shins). + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | HalfCheetah-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | HalfCheetah-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | HalfCheetah-v3 | `mujoco-py` | Maintained for reproducibility | + | HalfCheetah-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | --------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) | + | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) | + | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) | + | 3 | Torque applied on the front thigh rotor | -1 | 1 | fthigh | hinge | torque (N m) | + | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) | + | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) | + + + ## Observation Space + The observation Space consists of the following parts (in order): + + - qpos (8 elements by default):* Position values of the robots's body parts. + - qvel (9 elements):* The velocities of these individual body parts, + (their derivatives). + + By default, the observation does not include the robot's x-coordinate (`rootx`). + This can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (18,), float64)`, where the first observation element is the x--coordinate of the robot. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + However, by default, the observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | + | 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) | + | 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) | + | 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) | + | 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) | + | 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) | + | 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) | + | 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) | + | 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) | + | 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) | + | 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | + | 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) | + + + ## Rewards + The total reward is: ***reward*** *=* *forward_reward - ctrl_cost*. + + - *forward_reward*: + A reward for moving forward, + this reward would be positive if the Half Cheetah moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the "tip" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions which is depends on the `frame_skip` parameter (default is 5), + and `frametime` which is 0.01 - so the default is $dt = 5 \times 0.01 = 0.05$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward to penalize the Half Cheetah for taking actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is the `ctrl_cost_weight` (default is $0.1$). + + `info` contains the individual reward terms. + + + ## Starting State + The initial position state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{9}, reset\_noise\_scale \times 1_{9}]}$. + The initial velocity state is $\mathcal{N}(0_{9}, reset\_noise\_scale^2 \times I_{9})$. + + where $\mathcal{N}$ is the multivariate normal distribution and $\mathcal{U}$ is the multivariate uniform continuous distribution. + + + ## Episode End + #### Termination + The Half Cheetah never terminates. + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + HalfCheetah provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('HalfCheetah-v5', ctrl_cost_weight=0.1, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ weight (see section on reward) | + | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Restored the `xml_file` argument (was removed in `v4`). + - Renamed `info["reward_run"]` to `info["reward_forward"]` to be consistent with the other environments. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen). + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "half_cheetah.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 0.1, + reset_noise_scale: float = 0.1, + exclude_current_positions_from_observation: bool = True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 1 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 1 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + info = { + "x_position": x_position_after, + "x_velocity": x_velocity, + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + } + + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = ( + self.init_qvel + + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv) + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + } diff --git a/gymnasium/envs/mujoco/hopper_v4.py b/gymnasium/envs/mujoco/hopper_v4.py index 17430c9a2..2d32b9b86 100644 --- a/gymnasium/envs/mujoco/hopper_v4.py +++ b/gymnasium/envs/mujoco/hopper_v4.py @@ -14,133 +14,6 @@ class HopperEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the work done by Erez, Tassa, and Todorov in - ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to - increase the number of independent state and control variables as compared to - the classic control environments. The hopper is a two-dimensional - one-legged figure that consist of four main body parts - the torso at the - top, the thigh in the middle, the leg in the bottom, and a single foot on - which the entire body rests. The goal is to make hops that move in the - forward (right) direction by applying torques on the three hinges - connecting the four body parts. - - ## Action Space - The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the - hopper, followed by the velocities of those individual parts - (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the hopper. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be `Box(-Inf, Inf, (12,), float64)` where the first observation - represents the x-coordinate of the hopper. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - will be returned in `info` with key `"x_position"`. - - However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements - correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | - | 6 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | - | 7 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | - - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`. - - *forward_reward*: A reward of hopping forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (fixed to 4), where the frametime is 0.002 - making the - default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper - hops forward (positive x direction). - - *ctrl_cost*: A cost for penalising the hopper if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise - in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ## Episode End - The hopper is said to be unhealthy if any of the following happens: - - 1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range` - 2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen) - 3. The angle (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The hopper is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Hopper-v2') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Hopper-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.001` | Weight for _ctrl_cost_ reward (see section on reward) | - | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the hopper is no longer healthy | - | `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy | - | `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy | - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py new file mode 100644 index 000000000..c058c3de4 --- /dev/null +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -0,0 +1,359 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict, Tuple + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 2, + "distance": 3.0, + "lookat": np.array((0.0, 0.0, 1.15)), + "elevation": -20.0, +} + + +class HopperEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the work done by Erez, Tassa, and Todorov in + ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to + increase the number of independent state and control variables as compared to + the classic control environments. The hopper is a two-dimensional + one-legged figure that consist of four main body parts - the torso at the + top, the thigh in the middle, the leg in the bottom, and a single foot on + which the entire body rests. The goal is to make hops that move in the + forward (right) direction by applying torques on the three hinges + connecting the four body parts. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Hopper-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Hopper-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Hopper-v3 | `mujoco-py` | Maintained for reproducibility | + | Hopper-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + ## Action Space + The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + + + ## Observation Space + The observation Space consists of the following parts (in order): + + - qpos (5 elements by default):* Position values of the robots's body parts. + - qvel (6 elements):* The velocities of these individual body parts, + (their derivatives). + + By default, observations do not include the x-coordinate of the hopper. It may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be `Box(-Inf, Inf, (12,), float64)` where the first observation + represents the x-coordinate of the hopper. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate + will be returned in `info` with key `"x_position"`. + + By default, the observation does not include the robot's x-coordinate (`rootx`). + This can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (12,), float64)`, where the first observation element is the x--coordinate of the robot. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements + correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | + | 6 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | + | 7 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | + + + ## Rewards + The total reward is: ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. + + - *healthy_reward*: + Every timestep that the Hopper is healthy (see definition in section "Episode Termination"), + it gets a reward of fixed value `healthy_reward`. + - *forward_reward*: + A reward for moving forward, + this reward would be positive if the Hopper moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the "torso" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 4), + and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward to penalize the Hopper for taking actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). + + `info` contains the individual reward terms. + + + ## Starting State + The initial position state is $[0, 1.25, 0, 0, 0, 0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{6}, reset\_noise\_scale \times 1_{6}]}$. + The initial velocity state is $0_6 + \mathcal{U}_{[-reset\_noise\_scale \times 1_{6}, reset\_noise\_scale \times 1_{6}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z-coordinate is non-zero so that the hopper can stand up immediately. + + + ## Episode End + #### Termination + If `terminate_when_unhealthy is True` (the default), the environment terminates when the Hopper is unhealthy. + The Hopper is unhealthy if any of the following happens: + + 1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range` + 2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen) + 3. The angle of the torso (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range` + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + Hopper provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('Hopper-v5', ctrl_cost_weight=1e-3, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctrl_cost_ reward (see section on reward) | + | `healthy_reward` | **float** | `1` | Weight for _healthy_reward_ reward (see section on reward) | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the hopper is no longer healthy | + | `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy | + | `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy | + | `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy | + | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Hopper was unhealthy), now it is only given when the Hopper is healthy. The `info["reward_survive"]` is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added individual reward terms in `info` (`info["reward_forward"]`, info`["reward_ctrl"]`, `info["reward_survive"]`). + - Added `info["z_distance_from_origin"]` which is equal to the vertical distance of the "torso" body from its initial position. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0). + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "hopper.xml", + frame_skip: int = 4, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 1e-3, + healthy_reward: float = 1.0, + terminate_when_unhealthy: bool = True, + healthy_state_range: Tuple[float, float] = (-100.0, 100.0), + healthy_z_range: Tuple[float, float] = (0.7, float("inf")), + healthy_angle_range: Tuple[float, float] = (-0.2, 0.2), + reset_noise_scale: float = 5e-3, + exclude_current_positions_from_observation: bool = True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + healthy_reward, + terminate_when_unhealthy, + healthy_state_range, + healthy_z_range, + healthy_angle_range, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_state_range = healthy_state_range + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 1 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 1 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.data.qpos[1:3] + state = self.state_vector()[2:] + + min_state, max_state = self._healthy_state_range + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_state = np.all(np.logical_and(min_state < state, state < max_state)) + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + + is_healthy = all((healthy_state, healthy_z, healthy_angle)) + + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + return terminated + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = np.clip(self.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_survive": healthy_reward, + "x_position": x_position_after, + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + "x_velocity": x_velocity, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + } diff --git a/gymnasium/envs/mujoco/humanoid_v4.py b/gymnasium/envs/mujoco/humanoid_v4.py index 5b3d9df1d..c5a1bc9b7 100644 --- a/gymnasium/envs/mujoco/humanoid_v4.py +++ b/gymnasium/envs/mujoco/humanoid_v4.py @@ -20,249 +20,6 @@ def mass_center(model, data): class HumanoidEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the environment introduced by Tassa, Erez and Todorov - in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). - The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of - legs and arms. The legs each consist of three body parts, and the arms 2 body parts (representing the knees and - elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over. - - ## Action Space - The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| - | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | - | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | - | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | - | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | - | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | - | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | - | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations - represent the x- and y-coordinates of the torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | - | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | - | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | - | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | - | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | - | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | - | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | - | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | - | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | - | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | - | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | - | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | - | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | - | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | - | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | - | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | - | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | - | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - - Additionally, after all the positional and velocity based values in the table, - the observation contains (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) - and hence adds to another 140 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence - adds another 84 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the - number of degrees of freedom (*= dim(qvel)*) - - The body parts are: - - | id (for `v2`,`v3`,`v4`) | body part | - | --- | ------------ | - | 0 | worldBody (note: all values are constant 0) | - | 1 | torso | - | 2 | lwaist | - | 3 | pelvis | - | 4 | right_thigh | - | 5 | right_sin | - | 6 | right_foot | - | 7 | left_thigh | - | 8 | left_sin | - | 9 | left_foot | - | 10 | right_upper_arm | - | 11 | right_lower_arm | - | 12 | left_upper_arm | - | 13 | left_lower_arm | - - The joints are: - - | id (for `v2`,`v3`,`v4`) | joint | - | --- | ------------ | - | 0 | root | - | 1 | root | - | 2 | root | - | 3 | root | - | 4 | root | - | 5 | root | - | 6 | abdomen_z | - | 7 | abdomen_y | - | 8 | abdomen_x | - | 9 | right_hip_x | - | 10 | right_hip_z | - | 11 | right_hip_y | - | 12 | right_knee | - | 13 | left_hip_x | - | 14 | left_hiz_z | - | 15 | left_hip_y | - | 16 | left_knee | - | 17 | right_shoulder1 | - | 18 | right_shoulder2 | - | 19 | right_elbow| - | 20 | left_shoulder1 | - | 21 | left_shoulder2 | - | 22 | left_elfbow | - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the - [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - **Note:** Humanoid-v4 environment no longer has the following contact forces issue. - If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 - results in the contact forces always being 0. As such we recommend to use a Mujoco-Py - version < 2.0 when using the Humanoid environment if you would like to report results - with contact forces (if contact forces are not used in your experiments, you can use - version > 2.0). - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward` - - *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` * - (average center of mass before action - average center of mass after action)/dt*. - *dt* is the time between actions and is dependent on the frame_skip parameter - (default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*. - This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation - for the center of mass is defined in the `.py` file for the Humanoid. - - *ctrl_cost*: A negative reward for penalising the humanoid if it has too - large of a control force. If there are *nu* actuators/controls, then the control has - shape `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control2)*. - - *contact_cost*: A negative reward for penalising the humanoid if the external - contact force is too large. It is calculated by clipping - *`contact_cost_weight` * sum(external contact force2)* to the interval specified by `contact_cost_range`. - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally - selected to be high, thereby indicating a standing up humanoid. The initial - orientation is designed to make it face forward as well. - - ## Episode End - The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the - closed interval specified by the argument `healthy_z_range`. - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 3. Termination: The humanoid is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Humanoid-v4') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Humanoid-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.25` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ term (see section on reward) | - | `contact_cost_weight` | **float** | `5e-7` | Weight for _contact_cost_ term (see section on reward) | - | `healthy_reward` | **float** | `5.0` | Constant reward given if the humanoid is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | - | `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range | - | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py new file mode 100644 index 000000000..778ac23e0 --- /dev/null +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -0,0 +1,548 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict, Tuple + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 1, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 2.0)), + "elevation": -20.0, +} + + +def mass_center(model, data): + mass = np.expand_dims(model.body_mass, axis=1) + xpos = data.xipos + return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() + + +class HumanoidEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the environment introduced by Tassa, Erez and Todorov + in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). + The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of + legs and arms. The legs each consist of three body parts, and the arms 2 body parts (representing the knees and + elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over. + + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Humanoid-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Humanoid-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Humanoid-v3 | `mujoco-py` | Maintained for reproducibility | + | Humanoid-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | + | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | + | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | + | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | + | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | + | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | + | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + + + ## Observation Space + The observation space consists of the following parts (in order) + + - qpos (22 elements by default):* The position values of the robot's body parts. + - qvel (23 elements):* The velocities of these individual body parts, + (their derivatives). + - *cinert (130 elements):* Mass and inertia of the rigid body parts relative to the center of mass, + (this is an intermediate result of the transition). + It has shape 13*10 (*nbody * 10*). + (cinert - inertia matrix and body mass offset and body mass) + - *cvel (78 elements):* Center of mass based velocity. + It has shape 13 * 6 (*nbody * 6*). + (com velocity - velocity x, y, z and angular velocity x, y, z) + - *qfrc_actuator (17 elements):* Constraint force generated as the actuator force at each joint. + This has shape `(17,)` *(nv * 1)*. + - *cfrc_ext (78 elements):* This is the center of mass based external force on the body parts. + It has shape 13 * 6 (*nbody * 6*) and thus adds to another 78 elements in the observation space. + (external forces - force x, y, z and torque x, y, z) + + where *nbody* is for the number of bodies in the robot + and *nv* is for the number of degrees of freedom (*= dim(qvel)*). + + By default, the observation does not include the x- and y-coordinates of the torso. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (350,), float64)`, where the first two observations are the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, however, the observation space is a `Box(-Inf, Inf, (348,), float64)`, where the position and velocity elements are as follows: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | + | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | + | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | + | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | + | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | + | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | + | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | + | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | + | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | + | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | + | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | + | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | + | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | + | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | + | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | + | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | + | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | + | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angular velocity (rad/s) | + | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angular velocity (rad/s) | + | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) | + | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angular velocity (rad/s) | + | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angular velocity (rad/s) | + | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angular velocity (rad/s) | + | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angular velocity (rad/s) | + | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angular velocity (rad/s) | + | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angular velocity (rad/s) | + | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angular velocity (rad/s) | + | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angular velocity (rad/s) | + | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angular velocity (rad/s) | + | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angular velocity (rad/s) | + | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angular velocity (rad/s) | + | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angular velocity (rad/s) | + | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angular velocity (rad/s) | + | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + + The body parts are: + + | body part | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | worldbody (note: all values are constant 0) | 0 |excluded| + | torso |1 | 0 | + | lwaist |2 | 1 | + | pelvis |3 | 2 | + | right_thigh |4 | 3 | + | right_sin |5 | 4 | + | right_foot |6 | 5 | + | left_thigh |7 | 6 | + | left_sin |8 | 7 | + | left_foot |9 | 8 | + | right_upper_arm |10 | 9 | + | right_lower_arm |11 | 10 | + | left_upper_arm |12 | 11 | + | left_lower_arm |13 | 12 | + + The joints are: + + | joint | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | root (note: all values are constant 0) | 0 |excluded| + | root (note: all values are constant 0) | 1 |excluded| + | root (note: all values are constant 0) | 2 |excluded| + | root (note: all values are constant 0) | 3 |excluded| + | root (note: all values are constant 0) | 4 |excluded| + | root (note: all values are constant 0) | 5 |excluded| + | abdomen_z | 6 | 0 | + | abdomen_y | 7 | 1 | + | abdomen_x | 8 | 2 | + | right_hip_x | 9 | 3 | + | right_hip_z | 10 | 4 | + | right_hip_y | 11 | 5 | + | right_knee | 12 | 6 | + | left_hip_x | 13 | 7 | + | left_hiz_z | 14 | 8 | + | left_hip_y | 15 | 9 | + | left_knee | 16 | 10 | + | right_shoulder1 | 17 | 11 | + | right_shoulder2 | 18 | 12 | + | right_elbow | 19 | 13 | + | left_shoulder1 | 20 | 14 | + | left_shoulder2 | 21 | 15 | + | left_elfbow | 22 | 16 | + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the + [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + **Note:** + When using Humanoid-v3 or earlier versions, problems have been reported when using a Mujoco-Py version > 2.0. + results in the contact forces always being 0. As such we recommend to use a Mujoco-Py + version < 2.0 when using the Humanoid environment if you would like to report results + with contact forces (if contact forces are not used in your experiments, you can use + version > 2.0). + + + ## Rewards + The total reward is: ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. + + - *healthy_reward*: + Every timestep that the Humanoid is alive (see section Episode Termination for definition), + it gets a reward of fixed value `healthy_reward`. + - *forward_reward*: + A reward for moving forward, + this reward would be positive if the Humanoid moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the center of mass ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 5), + and `frametime` which is 0.001 - so the default is $dt = 5 \times 0.003 = 0.015$, + $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). + - *ctrl_cost*: + A negative reward to penalize the Humanoid for taking actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $0.1$). + - *contact_cost*: + A negative reward to penalize the Humanoid if the external contact forces are too large. + $w_{contact} \times clamp(contact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where + $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-7}$), + $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). + + `info` contains the individual reward terms. + + **Note:** There is a bug in the `Humanoid-v4` environment that causes *contact_cost* to always be 0. + + + ## Starting State + The initial position state is $[0.0, 0.0, 1.4, 1.0, 0.0, ... 0.0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{24}, reset\_noise\_scale \times 1_{24}]}$. + The initial velocity state is $0_{23} + \mathcal{U}_{[-reset\_noise\_scale \times 1_{23}, reset\_noise\_scale \times 1_{23}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z- and x-coordinates are non-zero so that the humanoid can immediately stand up and face forward (x-axis). + + + ## Episode End + #### Termination + If `terminate_when_unhealthy is True` (the default), the environment terminates when the Humanoid is unhealthy. + The Humanoid is said to be unhealthy if any of the following happens: + + 1. The z-position of the torso (the height) is no longer contained in `healthy_z_range`. + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + Humanoid provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('Humanoid-v5', ctrl_cost_weight=0.1, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1.25` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ term (see section on reward) | + | `contact_cost_weight` | **float** | `5e-7` | Weight for _contact_cost_ term (see section on reward) | + | `contact_cost_range` | **float** | `(-np.inf, 10.0) | Clamps the _contact_cost_ term (see section on reward) | + | `healthy_reward` | **float** | `5.0` | Weight for _healthy_reward_ term (see section on reward) | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | + | `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range | + | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + | `include_cinert_in_observation` | **bool** | `True` | Whether to include *cinert* elements in the observations. | + | `include_cvel_in_observation` | **bool** | `True` | Whether to include *cvel* elements in the observations. | + | `include_qfrc_actuator_in_observation` | **bool** | `True` | Whether to include *qfrc_actuator* elements in the observations. | + | `include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Humanoid was unhealthy), now it is only given when the Humanoid is healthy. The `info["reward_survive"]` is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - Restored `contact_cost` and the corresponding `contact_cost_weight` and `contact_cost_range` arguments, with the same defaults as in `Humanoid-v3` (was removed in `v4`) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/504)). + - Excluded the `cinert` & `cvel` & `cfrc_ext` of `worldbody` and `root`/`freejoint` `qfrc_actuator` from the observation space, as it was always 0, and thus provided no useful information to the agent, resulting in slightly faster training) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added `include_cinert_in_observation`, `include_cvel_in_observation`, `include_qfrc_actuator_in_observation`, `include_cfrc_ext_in_observation` arguments to allow for the exclusion of observation elements from the observation space. + - Fixed `info["x_position"]` & `info["y_position"]` & `info["distance_from_origin"]` returning `xpos` instead of `qpos` based observations (`xpos` observations are behind 1 `mj_step()` more [here](https://github.com/deepmind/mujoco/issues/889#issuecomment-1568896388)) (related [Github issue #1](https://github.com/Farama-Foundation/Gymnasium/issues/521) & [Github issue #2](https://github.com/Farama-Foundation/Gymnasium/issues/539)). + - Added `info["tendon_length"]` and `info["tendon_velocity"]` containing observations of the Humanoid's 2 tendons connecting the hips to the knees. + - Renamed `info["reward_alive"]` to `info["reward_survive"]` to be consistent with the other environments. + - Renamed `info["reward_linvel"]` to `info["reward_forward"]` to be consistent with the other environments. + - Renamed `info["reward_quadctrl"]` to `info["reward_ctrl"]` to be consistent with the other environments. + - Removed `info["forward_reward"]` as it is equivalent to `info["reward_forward"]`. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "humanoid.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.25, + ctrl_cost_weight: float = 0.1, + contact_cost_weight: float = 5e-7, + contact_cost_range: Tuple[float, float] = (-np.inf, 10.0), + healthy_reward: float = 5.0, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (1.0, 2.0), + reset_noise_scale: float = 1e-2, + exclude_current_positions_from_observation: bool = True, + include_cinert_in_observation: bool = True, + include_cvel_in_observation: bool = True, + include_qfrc_actuator_in_observation: bool = True, + include_cfrc_ext_in_observation: bool = True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + contact_cost_weight, + contact_cost_range, + healthy_reward, + terminate_when_unhealthy, + healthy_z_range, + reset_noise_scale, + exclude_current_positions_from_observation, + include_cinert_in_observation, + include_cvel_in_observation, + include_qfrc_actuator_in_observation, + include_cfrc_ext_in_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._contact_cost_weight = contact_cost_weight + self._contact_cost_range = contact_cost_range + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + self._include_cinert_in_observation = include_cinert_in_observation + self._include_cvel_in_observation = include_cvel_in_observation + self._include_qfrc_actuator_in_observation = ( + include_qfrc_actuator_in_observation + ) + self._include_cfrc_ext_in_observation = include_cfrc_ext_in_observation + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = self.data.qpos.size + self.data.qvel.size + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += self.data.cinert[1:].size * include_cinert_in_observation + obs_size += self.data.cvel[1:].size * include_cvel_in_observation + obs_size += (self.data.qvel.size - 6) * include_qfrc_actuator_in_observation + obs_size += self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation + + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + "cinert": self.data.cinert[1:].size * include_cinert_in_observation, + "cvel": self.data.cvel[1:].size * include_cvel_in_observation, + "qfrc_actuator": (self.data.qvel.size - 6) + * include_qfrc_actuator_in_observation, + "cfrc_ext": self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation, + "ten_lenght": 0, + "ten_velocity": 0, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(self.data.ctrl)) + return control_cost + + @property + def contact_cost(self): + contact_forces = self.data.cfrc_ext + contact_cost = self._contact_cost_weight * np.sum(np.square(contact_forces)) + min_cost, max_cost = self._contact_cost_range + contact_cost = np.clip(contact_cost, min_cost, max_cost) + return contact_cost + + @property + def is_healthy(self): + min_z, max_z = self._healthy_z_range + is_healthy = min_z < self.data.qpos[2] < max_z + + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + return terminated + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._include_cinert_in_observation is True: + com_inertia = self.data.cinert[1:].flat.copy() + else: + com_inertia = np.array([]) + if self._include_cvel_in_observation is True: + com_velocity = self.data.cvel[1:].flat.copy() + else: + com_velocity = np.array([]) + + if self._include_qfrc_actuator_in_observation is True: + actuator_forces = self.data.qfrc_actuator[6:].flat.copy() + else: + actuator_forces = np.array([]) + if self._include_cfrc_ext_in_observation is True: + external_contact_forces = self.data.cfrc_ext[1:].flat.copy() + else: + external_contact_forces = np.array([]) + + if self._exclude_current_positions_from_observation: + position = position[2:] + + return np.concatenate( + ( + position, + velocity, + com_inertia, + com_velocity, + actuator_forces, + external_contact_forces, + ) + ) + + def step(self, action): + xy_position_before = mass_center(self.model, self.data) + self.do_simulation(action, self.frame_skip) + xy_position_after = mass_center(self.model, self.data) + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + costs = ctrl_cost + contact_cost + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + + observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated + info = { + "reward_survive": healthy_reward, + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_contact": -contact_cost, + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + } diff --git a/gymnasium/envs/mujoco/humanoidstandup_v4.py b/gymnasium/envs/mujoco/humanoidstandup_v4.py index 40e791c9b..fe3cc9f06 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v4.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v4.py @@ -14,236 +14,6 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the environment introduced by Tassa, Erez and Todorov - in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). - The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a - pair of legs and arms. The legs each consist of two links, and so the arms (representing the - knees and elbows respectively). The environment starts with the humanoid laying on the ground, - and then the goal of the environment is to make the humanoid standup and then keep it standing - by applying torques on the various hinges. - - ## Action Space - The agent take a 17-element vector for actions. - - The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action` - represents the numerical torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | - | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | - | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | - | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | - | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | - | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | - | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | - | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations - represent the x- and y-coordinates of the torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | - | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | - | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | - | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | - | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | - | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | - | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | - | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | - | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | - | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | - | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | - | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | - | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | - | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | - | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | - | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | - | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | - | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - - Additionally, after all the positional and velocity based values in the table, - the observation contains (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) - and hence adds to another 140 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence - adds another 84 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the - number of degrees of freedom (*= dim(qvel)*) - - The body parts are: - - | id (for `v2`,`v3`,`v4`) | body part | - | --- | ------------ | - | 0 | worldBody (note: all values are constant 0) | - | 1 | torso | - | 2 | lwaist | - | 3 | pelvis | - | 4 | right_thigh | - | 5 | right_sin | - | 6 | right_foot | - | 7 | left_thigh | - | 8 | left_sin | - | 9 | left_foot | - | 10 | right_upper_arm | - | 11 | right_lower_arm | - | 12 | left_upper_arm | - | 13 | left_lower_arm | - - The joints are: - - | id (for `v2`,`v3`,`v4`) | joint | - | --- | ------------ | - | 0 | root | - | 1 | root | - | 2 | root | - | 3 | root | - | 4 | root | - | 5 | root | - | 6 | abdomen_z | - | 7 | abdomen_y | - | 8 | abdomen_x | - | 9 | right_hip_x | - | 10 | right_hip_z | - | 11 | right_hip_y | - | 12 | right_knee | - | 13 | left_hip_x | - | 14 | left_hiz_z | - | 15 | left_hip_y | - | 16 | left_knee | - | 17 | right_shoulder1 | - | 18 | right_shoulder2 | - | 19 | right_elbow| - | 20 | left_shoulder1 | - | 21 | left_shoulder2 | - | 22 | left_elfbow | - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the - [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - **Note:** HumanoidStandup-v4 environment no longer has the following contact forces issue. - If using previous HumanoidStandup versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results - in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 - when using the Humanoid environment if you would like to report results with contact forces - (if contact forces are not used in your experiments, you can use version > 2.0). - - ## Rewards - The reward consists of three parts: - - *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative - reward which measures how much upward it has moved from the last timestep, but it is an - absolute reward which measures how much upward the Humanoid has moved overall. It is - measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after - action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for - one frame of movement even though the simulation has a framerate of 5 (done in order to inflate - rewards a little for faster learning). - - *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of - a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`. - It is measured as *0.1 **x** sum(control2)*. - - *quad_impact_cost*: A negative reward for penalising the humanoid if the external - contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external - contact force2), 10)*. - - The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost* - - ## Starting State - All observations start in state - (0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of - [-0.01, 0.01] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally selected - to be low, thereby indicating a laying down humanoid. The initial orientation is - designed to make it face forward as well. - - ## Episode End - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: Any of the state space values is no longer finite - - ## Arguments - - No additional arguments are currently supported. - - ```python - import gymnasium as gym - env = gym.make('HumanoidStandup-v4') - ``` - - There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and - beyond take gymnasium.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ```python - import gymnasium as gym - env = gym.make('HumanoidStandup-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py new file mode 100644 index 000000000..fe6330560 --- /dev/null +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -0,0 +1,491 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict, Tuple + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 1, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 0.8925)), + "elevation": -20.0, +} + + +class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the environment introduced by Tassa, Erez and Todorov + in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). + The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a + pair of legs and arms. The legs each consist of two links, and so the arms (representing the + knees and elbows respectively). The environment starts with the humanoid laying on the ground, + and then the goal of the environment is to make the humanoid standup and then keep it standing + by applying torques on the various hinges. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | HumanoidStandup-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | HumanoidStandup-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | HumanoidStandup-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | + | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | + | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | + | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | + | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | + | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | + | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + + + ## Observation Space + The observation space consists of the following parts (in order) + + - qpos (22 elements by default):* The position values of the robot's body parts. + - qvel (23 elements):* The velocities of these individual body parts, + (their derivatives). + - *cinert (130 elements):* Mass and inertia of the rigid body parts relative to the center of mass, + (this is an intermediate result of the transition). + It has shape 13*10 (*nbody * 10*). + (cinert - inertia matrix and body mass offset and body mass) + - *cvel (78 elements):* Center of mass based velocity. + It has shape 13 * 6 (*nbody * 6*). + (com velocity - velocity x, y, z and angular velocity x, y, z) + - *qfrc_actuator (17 elements):* Constraint force generated as the actuator force at each joint. + This has shape `(17,)` *(nv * 1)*. + - *cfrc_ext (78 elements):* This is the center of mass based external force on the body parts. + It has shape 13 * 6 (*nbody * 6*) and thus adds to another 78 elements in the observation space. + (external forces - force x, y, z and torque x, y, z) + + where *nbody* is for the number of bodies in the robot + and *nv* is for the number of degrees of freedom (*= dim(qvel)*). + + By default, the observation does not include the x- and y-coordinates of the torso. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (350,), float64)`, where the first two observations are the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, however, the observation space is a `Box(-Inf, Inf, (348,), float64)`, where the position and velocity elements are as follows: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | + | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | + | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | + | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | + | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | + | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | + | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | + | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | + | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | + | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | + | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | + | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | + | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | + | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | + | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | + | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | + | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | + | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angular velocity (rad/s) | + | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angular velocity (rad/s) | + | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) | + | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angular velocity (rad/s) | + | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angular velocity (rad/s) | + | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angular velocity (rad/s) | + | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angular velocity (rad/s) | + | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angular velocity (rad/s) | + | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angular velocity (rad/s) | + | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angular velocity (rad/s) | + | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angular velocity (rad/s) | + | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angular velocity (rad/s) | + | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angular velocity (rad/s) | + | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angular velocity (rad/s) | + | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angular velocity (rad/s) | + | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angular velocity (rad/s) | + | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + + The body parts are: + + | body part | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | worldbody (note: all values are constant 0) | 0 |excluded| + | torso |1 | 0 | + | lwaist |2 | 1 | + | pelvis |3 | 2 | + | right_thigh |4 | 3 | + | right_sin |5 | 4 | + | right_foot |6 | 5 | + | left_thigh |7 | 6 | + | left_sin |8 | 7 | + | left_foot |9 | 8 | + | right_upper_arm |10 | 9 | + | right_lower_arm |11 | 10 | + | left_upper_arm |12 | 11 | + | left_lower_arm |13 | 12 | + + The joints are: + + | joint | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | root (note: all values are constant 0) | 0 |excluded| + | root (note: all values are constant 0) | 1 |excluded| + | root (note: all values are constant 0) | 2 |excluded| + | root (note: all values are constant 0) | 3 |excluded| + | root (note: all values are constant 0) | 4 |excluded| + | root (note: all values are constant 0) | 5 |excluded| + | abdomen_z | 6 | 0 | + | abdomen_y | 7 | 1 | + | abdomen_x | 8 | 2 | + | right_hip_x | 9 | 3 | + | right_hip_z | 10 | 4 | + | right_hip_y | 11 | 5 | + | right_knee | 12 | 6 | + | left_hip_x | 13 | 7 | + | left_hiz_z | 14 | 8 | + | left_hip_y | 15 | 9 | + | left_knee | 16 | 10 | + | right_shoulder1 | 17 | 11 | + | right_shoulder2 | 18 | 12 | + | right_elbow | 19 | 13 | + | left_shoulder1 | 20 | 14 | + | left_shoulder2 | 21 | 15 | + | left_elfbow | 22 | 16 | + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the + [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + **Note:** + When using Humanoid-v3 or earlier versions, problems have been reported when using a Mujoco-Py version > 2.0. + results in the contact forces always being 0. As such we recommend to use a Mujoco-Py + version < 2.0 when using the Humanoid environment if you would like to report results + with contact forces (if contact forces are not used in your experiments, you can use + version > 2.0). + + + ## Rewards + The total reward is: ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*. + + - *uph_cost*: + A reward for moving up (trying to stand up). + This is not a relative reward, measuring how far up it has moved since the last timestep, + but it is an absolute reward that measures how much upward the Humanoid has moved up in total. + It is measured as $weight_{uph} \times (z_{after action} - 0)/dt$, + where $z_{after action}$ is the z coordinate of the torso after taking an action, + and *dt* is the time between actions, and depends on the `frame_skip` parameter + (default is 5), where the frametime is 0.003 - so the default is *dt = 5 * 0.003 = 0.015*. + and $weight_{uph}$ is `uph_cost_weight`. + - *quad_ctrl_cost*: + A negative reward to penalize the Humanoid for taking actions that are too large. + $w_{quad_control} \times \\|action\\|_2^2$, + where $w_{quad_control}$ is `ctrl_cost_weight` (default is $0.1$). + If there are *nu* actuators/controls, then the control has shape `nu x 1`. + - *impact_cost*: + A negative reward to penalize the Humanoid if the external contact forces are too large. + $w_{impact} \times clamp(impact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where + $w_{impact}$ is `impact_cost_weight` (default is $5\times10^{-7}$), + $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). + + `info` contains the individual reward terms. + + + ## Starting State + The initial position state is $[0.0, 0.0, 1.4, 1.0, 0.0, ... 0.0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{24}, reset\_noise\_scale \times 1_{24}]}$. + The initial velocity state is $0_{23} + \mathcal{U}_{[-reset\_noise\_scale \times 1_{23}, reset\_noise\_scale \times 1_{23}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z- and x-coordinates are non-zero so that the humanoid immediately lies down and faces forward (x-axis). + + + ## Episode End + #### Termination + The Humanoid never terminates. + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + HumanoidStandup provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('HumanoidStandup-v5', impact_cost_weight=0.5e-6, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"humanoidstandup.xml"` | Path to a MuJoCo model | + | `uph_cost_weight` | **float** | `1` | Weight for _uph_cost_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `0.1` | Weight for _quad_ctrl_cost_ term (see section on reward) | + | `impact_cost_weight` | **float** | `0.5e-6` | Weight for _impact_cost_ term (see section on reward) | + | `impact_cost_range` | **float** | `(-np.inf, 10.0) | Clamps the _impact_cost_ | + | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + | `include_cinert_in_observation` | **bool** | `True` | Whether to include *cinert* elements in the observations. | + | `include_cvel_in_observation` | **bool** | `True` | Whether to include *cvel* elements in the observations. | + | `include_qfrc_actuator_in_observation` | **bool** | `True` | Whether to include *qfrc_actuator* elements in the observations. | + | `include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Excluded the `cinert` & `cvel` & `cfrc_ext` of `worldbody` and `root`/`freejoint` `qfrc_actuator` from the observation space, as it was always 0, and thus provided no useful information to the agent, resulting in slightly faster training) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added `xml_file` argument. + - Added `uph_cost_weight`, `ctrl_cost_weight`, `impact_cost_weight`, `impact_cost_range` arguments, to configure the reward function (defaults are effectively the same as in `v4`). + - Added `reset_noise_scale` argument, to set the range of initial states. + - Added `include_cinert_in_observation`, `include_cvel_in_observation`, `include_qfrc_actuator_in_observation`, `include_cfrc_ext_in_observation` arguments to allow for the exclusion of observation elements from the observation space. + - Added `info["tendon_length"]` and `info["tendon_velocity"]` containing observations of the Humanoid's 2 tendons connecting the hips to the knees. + - Added `info["x_position"]` & `info["y_position"]` , which contain the observations excluded when `exclude_current_positions_from_observation == True`. + - Added `info["z_distance_from_origin"]` which is equal to the vertical distance of the "torso" body from its initial position. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0). + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "humanoidstandup.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + uph_cost_weight: float = 1, + ctrl_cost_weight: float = 0.1, + impact_cost_weight: float = 0.5e-6, + impact_cost_range: Tuple[float, float] = (-np.inf, 10.0), + reset_noise_scale: float = 1e-2, + exclude_current_positions_from_observation: bool = True, + include_cinert_in_observation: bool = True, + include_cvel_in_observation: bool = True, + include_qfrc_actuator_in_observation: bool = True, + include_cfrc_ext_in_observation: bool = True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + uph_cost_weight, + ctrl_cost_weight, + impact_cost_weight, + impact_cost_range, + reset_noise_scale, + exclude_current_positions_from_observation, + include_cinert_in_observation, + include_cvel_in_observation, + include_qfrc_actuator_in_observation, + include_cfrc_ext_in_observation, + **kwargs, + ) + + self._uph_cost_weight = uph_cost_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._impact_cost_weight = impact_cost_weight + self._impact_cost_range = impact_cost_range + self._reset_noise_scale = reset_noise_scale + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + self._include_cinert_in_observation = include_cinert_in_observation + self._include_cvel_in_observation = include_cvel_in_observation + self._include_qfrc_actuator_in_observation = ( + include_qfrc_actuator_in_observation + ) + self._include_cfrc_ext_in_observation = include_cfrc_ext_in_observation + + obs_size = 47 + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += 130 * include_cinert_in_observation + obs_size += 78 * include_cvel_in_observation + obs_size += 17 * include_qfrc_actuator_in_observation + obs_size += 78 * include_cfrc_ext_in_observation + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = self.data.qpos.size + self.data.qvel.size + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += self.data.cinert[1:].size * include_cinert_in_observation + obs_size += self.data.cvel[1:].size * include_cvel_in_observation + obs_size += (self.data.qvel.size - 6) * include_qfrc_actuator_in_observation + obs_size += self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation + + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + "cinert": self.data.cinert[1:].size * include_cinert_in_observation, + "cvel": self.data.cvel[1:].size * include_cvel_in_observation, + "qfrc_actuator": (self.data.qvel.size - 6) + * include_qfrc_actuator_in_observation, + "cfrc_ext": self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation, + "ten_lenght": 0, + "ten_velocity": 0, + } + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._include_cinert_in_observation is True: + com_inertia = self.data.cinert[1:].flat.copy() + else: + com_inertia = np.array([]) + if self._include_cvel_in_observation is True: + com_velocity = self.data.cvel[1:].flat.copy() + else: + com_velocity = np.array([]) + + if self._include_qfrc_actuator_in_observation is True: + actuator_forces = self.data.qfrc_actuator[6:].flat.copy() + else: + actuator_forces = np.array([]) + if self._include_cfrc_ext_in_observation is True: + external_contact_forces = self.data.cfrc_ext[1:].flat.copy() + else: + external_contact_forces = np.array([]) + + if self._exclude_current_positions_from_observation: + position = position[2:] + + return np.concatenate( + ( + position, + velocity, + com_inertia, + com_velocity, + actuator_forces, + external_contact_forces, + ) + ) + + def step(self, action): + self.do_simulation(action, self.frame_skip) + pos_after = self.data.qpos[2] + + uph_cost = (pos_after - 0) / self.model.opt.timestep + + quad_ctrl_cost = self._ctrl_cost_weight * np.square(self.data.ctrl).sum() + + quad_impact_cost = ( + self._impact_cost_weight * np.square(self.data.cfrc_ext).sum() + ) + min_impact_cost, max_impact_cost = self._impact_cost_range + quad_impact_cost = np.clip(quad_impact_cost, min_impact_cost, max_impact_cost) + + reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1 + info = { + "reward_linup": uph_cost, + "reward_quadctrl": -quad_ctrl_cost, + "reward_impact": -quad_impact_cost, + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + } + + if self.render_mode == "human": + self.render() + return self._get_obs(), reward, False, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + } diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py index 7215683fa..2a9f9345d 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py @@ -13,117 +13,6 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment originates from control theory and builds on the cartpole - environment based on the work done by Barto, Sutton, and Anderson in - ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), - powered by the Mujoco physics simulator - allowing for more complex experiments - (such as varying the effects of gravity or constraints). This environment involves a cart that can - moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one - (leaving the second pole as the only one with one free end). The cart can be pushed left or right, - and the goal is to balance the second pole on top of the first pole, which is in turn on top of the - cart, by applying continuous forces on the cart. - - ## Action Space - The agent take a 1-element vector for actions. - The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the - numerical force applied to the cart (with magnitude representing the amount of force and - sign representing the direction) - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| - | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) | - - ## Observation Space - - The state space consists of positional values of different body parts of the pendulum system, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ----------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | - | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless | - | 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless | - | 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless | - | 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless | - | 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | - | 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | - | 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) | - | 8 | constraint force - 1 | -Inf | Inf | | | Force (N) | - | 9 | constraint force - 2 | -Inf | Inf | | | Force (N) | - | 10 | constraint force - 3 | -Inf | Inf | | | Force (N) | - - - There is physical contact between the robots and their environment - and Mujoco - attempts at getting realistic physics simulations for the possible physical contact - dynamics by aiming for physical accuracy and computational efficiency. - - There is one constraint force for contacts for each degree of freedom (3). - The approach and handling of constraints by Mujoco is unique to the simulator - and is based on their research. Once can find more information in their - [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html) - or in their paper - ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf). - - - ## Rewards - - The reward consists of two parts: - - *alive_bonus*: The goal is to make the second inverted pendulum stand upright - (within a certain angle limit) as long as possible - as such a reward of +10 is awarded - for each timestep that the second pole is upright. - - *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum - (the only free end) moves, and it is calculated as - *0.01 * x2 + (y - 2)2*, where *x* is the x-coordinate of the tip - and *y* is the y-coordinate of the tip of the second pole. - - *velocity_penalty*: A negative reward for penalising the agent if it moves too - fast *0.001 * v12 + 0.005 * v2 2* - - The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty* - - ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range - of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard - normal force with a standard deviation of 0.1 added to the velocity values for stochasticity. - - ## Episode End - The episode ends when any of the following happens: - - 1.Truncation: The episode duration reaches 1000 timesteps. - 2.Termination: Any of the state space values is no longer finite. - 3.Termination: The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other). - - ## Arguments - - No additional arguments are currently supported. - - ```python - import gymnasium as gym - env = gym.make('InvertedDoublePendulum-v4') - ``` - There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and - beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('InvertedDoublePendulum-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py new file mode 100644 index 000000000..dfa190ca2 --- /dev/null +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -0,0 +1,242 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 0, + "distance": 4.1225, + "lookat": np.array((0.0, 0.0, 0.12250000000000005)), +} + + +class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment originates from control theory and builds on the cartpole + environment based on the work done by Barto, Sutton, and Anderson in + ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), + powered by the Mujoco physics simulator - allowing for more complex experiments + (such as varying the effects of gravity or constraints). This environment involves a cart that can + moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one + (leaving the second pole as the only one with one free end). The cart can be pushed left or right, + and the goal is to balance the second pole on top of the first pole, which is in turn on top of the + cart, by applying continuous forces on the cart. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | InvertedDoublePendulum-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | InvertedDoublePendulum-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | InvertedDoublePendulum-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + ## Action Space + The agent take a 1-element vector for actions. + The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the + numerical force applied to the cart (with magnitude representing the amount of force and + sign representing the direction) + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint |Type (Unit)| + |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| + | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) | + + + ## Observation Space + The state space consists of positional values of different body parts of the pendulum system, + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(9,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ----------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | + | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless | + | 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless | + | 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless | + | 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless | + | 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | + | 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | + | 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) | + | 8 | constraint force - x | -Inf | Inf | slider | slide | Force (N) | + | excluded | constraint force - y | -Inf | Inf | hinge | slide | Force (N) | + | excluded | constraint force - z | -Inf | Inf | hinge2 | slide | Force (N) | + + + There is physical contact between the robots and their environment - and Mujoco + attempts at getting realistic physics simulations for the possible physical contact + dynamics by aiming for physical accuracy and computational efficiency. + + There is one constraint force for contacts for each degree of freedom (3). + The approach and handling of constraints by Mujoco is unique to the simulator and is based on their research. + Once can find more information in their [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html) + or in their paper ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf). + + + ## Rewards + The total reward is: ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*. + + - *alive_bonus*: + The goal is to keep the second inverted pendulum upright (within a certain angle limit) as long as possible - + so for each timestep that the second pole is upright, a reward of `healthy_reward` is given. + - *distance_penalty*: + This reward is a measure of how far the *tip* of the second pendulum (the only free end) moves, + and it is calculated as $0.01 x_{pole2-tip}^2 + (y_{pole2-tip}-2)^2$, + where $x_{pole2-tip}, y_{pole2-tip}$ are the xy-coordinatesof the tip of the second pole. + - *velocity_penalty*: + A negative reward to penalize the agent for moving too fast. + $10^{-3} \omega_1 + 5 10^{-3} \omega_2$, + where $\omega_1, \omega_2$ are the angular velocities of the hinges. + + `info` contains the individual reward terms. + + + ## Starting State + The initial position state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{3}, reset\_noise\_scale \times 1_{3}]}$. + The initial velocity state is $\mathcal{N}(0_{3}, reset\_noise\_scale^2 \times I_{3})$. + + where $\mathcal{N}$ is the multivariate normal distribution and $\mathcal{U}$ is the multivariate uniform continuous distribution. + + + ## Episode End + #### Termination + The environment terminates when the Inverted Double Pendulum is unhealthy. + The Inverted Double Pendulum is unhealthy if any of the following happens: + + 1.Termination: The y_coordinate of the tip of the second pole $\leq 1$. + + Note: The maximum standing height of the system is 1.2 m when all the parts are perpendicularly vertical on top of each other. + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + InvertedDoublePendulum provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('InvertedDoublePendulum-v5', healthy_reward=10, ...) + ``` + + | Parameter | Type | Default | Description | + |-------------------------|------------|-------------- |-------------------------------| + | `xml_file` | **str** |`"inverted_double_pendulum.xml"`| Path to a MuJoCo model | + | `healthy_reward` | **float** | `10 | Constant reward given if the pendulum is "healthy" (upright) | + | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Pendulum is unhealthy), now it is only given if the DoublePendulum is healthy (not terminated)(related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/500)). + - Excluded the `qfrc_constraint` ("constraint force") of the hinges from the observation space (as it was always 0, thus providing no useful information to the agent, resulting is slightly faster training) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/228)). + - Added `xml_file` argument. + - Added `reset_noise_scale` argument, to set the range of initial states. + - Added `healthy_reward` argument to configure the reward function (defaults are effectively the same as in `v4`). + - Added individual reward terms in `info` ( `info["reward_survive"]`, `info["distance_penalty"]`, `info["velocity_penalty"]`). + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "inverted_double_pendulum.xml", + frame_skip: int = 5, + healthy_reward: float = 10.0, + reset_noise_scale: float = 0.1, + **kwargs, + ): + utils.EzPickle.__init__(self, xml_file, frame_skip, reset_noise_scale, **kwargs) + + self._healthy_reward = healthy_reward + self._reset_noise_scale = reset_noise_scale + + observation_space = Box(low=-np.inf, high=np.inf, shape=(9,), dtype=np.float64) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + def step(self, action): + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + + x, _, y = self.data.site_xpos[0] + v1, v2 = self.data.qvel[1:3] + + terminated = bool(y <= 1) + + dist_penalty = 0.01 * x**2 + (y - 2) ** 2 + vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2 + alive_bonus = self._healthy_reward * int(not terminated) + reward = alive_bonus - dist_penalty - vel_penalty + + info = { + "reward_survive": alive_bonus, + "distance_penalty": -dist_penalty, + "velocity_penalty": -vel_penalty, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def _get_obs(self): + return np.concatenate( + [ + self.data.qpos[:1], # cart x pos + np.sin(self.data.qpos[1:]), # link angles + np.cos(self.data.qpos[1:]), + np.clip(self.data.qvel, -10, 10), + np.clip(self.data.qfrc_constraint, -10, 10)[:1], + ] + ).ravel() + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + self.set_state( + self.init_qpos + + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ), + self.init_qvel + + self.np_random.standard_normal(self.model.nv) * self._reset_noise_scale, + ) + return self._get_obs() diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v4.py b/gymnasium/envs/mujoco/inverted_pendulum_v4.py index e935398be..72f6a9c88 100644 --- a/gymnasium/envs/mujoco/inverted_pendulum_v4.py +++ b/gymnasium/envs/mujoco/inverted_pendulum_v4.py @@ -12,87 +12,6 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is the cartpole environment based on the work done by - Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can - solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), - just like in the classic environments but now powered by the Mujoco physics simulator - - allowing for more complex experiments (such as varying the effects of gravity). - This environment involves a cart that can moved linearly, with a pole fixed on it - at one end and having another end free. The cart can be pushed left or right, and the - goal is to balance the pole on the top of the cart by applying forces on the cart. - - ## Action Space - The agent take a 1-element vector for actions. - - The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents - the numerical force applied to the cart (with magnitude representing the amount of - force and sign representing the direction) - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| - | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) | - - ## Observation Space - - The state space consists of positional values of different body parts of - the pendulum system, followed by the velocities of those individual parts (their derivatives) - with all the positions ordered before all the velocities. - - The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------- | - | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | - | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) | - | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | - | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | anglular velocity (rad/s) | - - - ## Rewards - - The goal is to make the inverted pendulum stand upright (within a certain angle limit) - as long as possible - as such a reward of +1 is awarded for each timestep that - the pole is upright. - - ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range - of [-0.01, 0.01] added to the values for stochasticity. - - ## Episode End - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches 1000 timesteps. - 2. Termination: Any of the state space values is no longer finite. - 3. Termination: The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian. - - ## Arguments - - No additional arguments are currently supported. - - ```python - import gymnasium as gym - env = gym.make('InvertedPendulum-v4') - ``` - There is no v3 for InvertedPendulum, unlike the robot environments where a - v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - ```python - import gymnasium as gym - env = gym.make('InvertedPendulum-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_pendulum_v5.py new file mode 100644 index 000000000..6d519c90b --- /dev/null +++ b/gymnasium/envs/mujoco/inverted_pendulum_v5.py @@ -0,0 +1,205 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 0, + "distance": 2.04, +} + + +class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): + """ + ## Description + This environment is the cartpole environment based on the work done by + Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can + solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), + just like in the classic environments but now powered by the Mujoco physics simulator - + allowing for more complex experiments (such as varying the effects of gravity). + This environment involves a cart that can moved linearly, with a pole fixed on it + at one end and having another end free. The cart can be pushed left or right, and the + goal is to balance the pole on the top of the cart by applying forces on the cart. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | InvertedPendulum-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | InvertedPendulum-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | InvertedPendulum-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The agent take a 1-element vector for actions. + + The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents + the numerical force applied to the cart (with magnitude representing the amount of + force and sign representing the direction) + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint |Type (Unit)| + |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| + | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) | + + + ## Observation Space + The state space consists of positional values of different body parts of + the pendulum system, followed by the velocities of those individual parts (their derivatives) + with all the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | --------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------- | + | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | + | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) | + | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | + | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | + + + ## Rewards + The goal is to keep the inverted pendulum stand upright (within a certain angle limit) + for as long as possible - as such a reward of +1 is awarded for each timestep that + the pole is upright. + + The pole is considered upright if: + $|angle| < 0.2$. + + and `info` also contains the reward. + + + ## Starting State + The initial position state is $\\mathcal{U}_{[-reset\\_noise\\_scale \times 1_{2}, reset\\_noise\\_scale \times 1_{2}]}$. + The initial velocity state is $\\mathcal{U}_{[-reset\\_noise\\_scale \times 1_{2}, reset\\_noise\\_scale \times 1_{2}]}$. + + where $\\mathcal{U}$ is the multivariate uniform continuous distribution. + + All observations start in state + (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range + of `[-reset_noise_scale, reset_noise_scale]` added to the values for stochasticity. + + + ## Episode End + #### Termination + The environment terminates when the Inverted Pendulum is unhealthy. + The Inverted Pendulum is unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite. + 2. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian. + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + InvertedPendulum provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('InvertedPendulum-v5', reset_noise_scale=0.1) + ``` + + | Parameter | Type | Default |Description | + |-------------------------|------------|--------------|-------------------------------| + | `xml_file` | **str** | `"inverted_double_pendulum.xml"` | Path to a MuJoCo model | + | `reset_noise_scale` | **float** | `0.01` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Pendulum is unhealthy), now it is only given if the Pendulum is healthy (not terminated) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/500)). + - Added `xml_file` argument. + - Added `reset_noise_scale` argument to set the range of initial states. + - Added `info["reward_survive"]` which contains the reward. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.. + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.5. + * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum). + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "inverted_pendulum.xml", + frame_skip: int = 2, + reset_noise_scale: float = 0.01, + **kwargs, + ): + utils.EzPickle.__init__(self, xml_file, frame_skip, reset_noise_scale, **kwargs) + observation_space = Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float64) + + self._reset_noise_scale = reset_noise_scale + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + self.observation_structure = { + "qpos": self.data.qpos.size, + "qvel": self.data.qvel.size, + } + + def step(self, action): + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + + terminated = bool( + not np.isfinite(observation).all() or (np.abs(observation[1]) > 0.2) + ) + + reward = int(not terminated) + + info = {"reward_survive": reward} + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + size=self.model.nq, low=noise_low, high=noise_high + ) + qvel = self.init_qvel + self.np_random.uniform( + size=self.model.nv, low=noise_low, high=noise_high + ) + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate([self.data.qpos, self.data.qvel]).ravel() diff --git a/gymnasium/envs/mujoco/pusher_v4.py b/gymnasium/envs/mujoco/pusher_v4.py index 55ba517e0..651c6325c 100644 --- a/gymnasium/envs/mujoco/pusher_v4.py +++ b/gymnasium/envs/mujoco/pusher_v4.py @@ -12,133 +12,6 @@ class PusherEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - "Pusher" is a multi-jointed robot arm which is very similar to that of a human. - The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*). - The robot consists of shoulder, elbow, forearm, and wrist joints. - - ## Action Space - The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | - | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | - | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | - | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | - | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | - | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | - | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | - - ## Observation Space - - Observations consist of - - - Angle of rotational joints on the pusher - - Angular velocities of rotational joints on the pusher - - The coordinates of the fingertip of the pusher - - The coordinates of the object to be moved - - The coordinates of the goal position - - The observation is a `Box(-Inf, Inf, (23,), float64)` where the elements correspond to the table below. - An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the - same as human joints. - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------------- | ---- | --- | -------------------------------- | -------- | ------------------------ | - | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) | - | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) | - | 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) | - | 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) | - | 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) | - | 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) | - | 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) | - | 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) | - | 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) | - | 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) | - | 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) | - | 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) | - | 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) | - | 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) | - | 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) | - | 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) | - | 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) | - | 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) | - | 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) | - | 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) | - - - ## Rewards - The reward consists of two parts: - - *reward_near *: This reward is a measure of how far the *fingertip* - of the pusher (the unattached end) is from the object, with a more negative - value assigned for when the pusher's *fingertip* is further away from the - target. It is calculated as the negative vector norm of (position of - the fingertip - position of target), or *-norm("fingertip" - "target")*. - - *reward_dist *: This reward is a measure of how far the object is from - the target goal position, with a more negative value assigned for object is - further away from the target. It is calculated as the negative vector norm of - (position of the object - position of goal), or *-norm("object" - "target")*. - - *reward_control*: A negative reward for penalising the pusher if - it takes actions that are too large. It is measured as the negative squared - Euclidean norm of the action, i.e. as *- sum(action2)*. - - The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near* - - Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms. - However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, - you should create a wrapper that computes the weighted reward from `info`. - - - ## Starting State - All pusher (not including object and goal) states start in - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range - [-0.005, 0.005] is added to the velocity attributes only. The velocities of - the object and goal are permanently set to 0. The object's x-position is selected uniformly - between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this - process is repeated until the vector norm between the object's (x,y) position and origin is not greater - than 0.17. The goal always have the same position of (0.45, -0.05, -0.323). - - The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05* - - ## Episode End - - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 100 timesteps. - 2. Termination: Any of the state space values is no longer finite. - - ## Arguments - - No additional arguments are currently supported (in v2 and lower), - but modifications can be made to the XML file in the assets folder - (or by changing the path to a modified XML file in another folder).. - - ```python - import gymnasium as gym - env = gym.make('Pusher-v4') - ``` - - There is no v3 for Pusher, unlike the robot environments where a v3 and - beyond take `gymnasmium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Pusher-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py new file mode 100644 index 000000000..c1be462ce --- /dev/null +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -0,0 +1,271 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": -1, + "distance": 4.0, +} + + +class PusherEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + "Pusher" is a multi-jointed robot arm which is very similar to that of a human. + The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*). + The robot consists of shoulder, elbow, forearm, and wrist joints. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Pusher-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Pusher-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Pusher-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | + | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | + | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | + | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | + | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | + | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | + | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | + + + ## Observation Space + Observations consist of + + - Angle of rotational joints on the pusher + - Angular velocities of rotational joints on the pusher + - The coordinates of the fingertip of the pusher + - The coordinates of the object to be moved + - The coordinates of the goal position + + The observation is a `Box(-Inf, Inf, (23,), float64)` where the elements correspond to the table below. + An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the + same as human joints. + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | -------------------------------------------------------- | ---- | --- | -------------------------------- | -------- | ------------------------ | + | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) | + | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) | + | 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) | + | 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) | + | 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) | + | 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) | + | 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) | + | 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) | + | 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) | + | 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) | + | 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) | + | 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) | + | 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) | + | 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) | + | 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) | + | 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) | + | 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) | + | 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) | + | 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) | + | 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) | + + + ## Rewards + The total reward is: ***reward*** *=* *reward_dist + reward_ctrl + reward_near*. + + - *reward_near*: + This reward is a measure of how far the *fingertip* of the pusher (the unattached end) is from the object, + with a more negative value assigned for when the pusher's *fingertip* is further away from the target. + It is $-w_{near} \|(P_{fingertip} - P_{target})\|_2$. + where $w_{near}$ is the `reward_near_weight`. + - *reward_dist*: + This reward is a measure of how far the object is from the target goal position, + with a more negative value assigned if the object that is further away from the target. + It is $-w_{dist} \|(P_{object} - P_{target})\|_2$. + where $w_{dist}$ is the `reward_dist_weight`. + - *reward_control*: + A negative reward to penalize the pusher for taking actions that are too large. + It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. + where $w_{control}$ is the `reward_control_weight`. + + `info` contains the individual reward terms. + + + ## Starting State + The initial position state of the Pusher arm is $0_{6}$. + The initial position state of the object is $\mathcal{U}_{[[-0.3, -0.2], [0, 0.2]]}$. + The position state of the goal is (permanently) $[0.45, -0.05, -0.323]$. + The initial velocity state of the Pusher arm is $\mathcal{U}_{[-0.005 \times 1_{6}, 0.005 \times 1_{6}]}$. + The initial velocity state of the object is $0_2$. + The velocity state of the goal is (permanently) $0_3$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the initial position state of the object is sampled until it's distance to the goal is $ > 0.17 m$. + + The default frame rate is 5, with each frame lasting for 0.01, so *dt = 5 * 0.01 = 0.05*. + + + ## Episode End + #### Termination + The Pusher never terminates. + + #### Truncation + The default duration of an episode is 100 timesteps + + + ## Arguments + Pusher provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('Pusher-v5', xml_file=...) + ``` + + | Parameter | Type | Default |Description | + |-------------------------|------------|--------------|----------------------------------------------------------| + | `xml_file` | **str** |`"pusher.xml"`| Path to a MuJoCo model | + | `reward_near_weight` | **float** | `0.5` | Weight for *reward_near* term (see section on reward) | + | `reward_dist_weight` | **float** | `1` | Weight for *reward_dist* term (see section on reward) | + | `reward_control_weight` | **float** | `0.1` | Weight for *reward_control* term (see section on reward) | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Added `xml_file` argument. + - Added `reward_near_weight`, `reward_dist_weight`, `reward_control_weight` arguments, to configure the reward function (defaults are effectively the same as in `v4`). + - Fixed `info["reward_ctrl"]` being not being multiplied by the reward weight. + - Added `info["reward_near"]` which is equal to the reward term `reward_near`. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks (not including pusher, which has a max_time_steps of 100). Added reward_threshold to environments. + * v0: Initial versions release (1.0.0). + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "pusher.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + reward_near_weight: float = 0.5, + reward_dist_weight: float = 1, + reward_control_weight: float = 0.1, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + reward_near_weight, + reward_dist_weight, + reward_control_weight, + **kwargs, + ) + self._reward_near_weight = reward_near_weight + self._reward_dist_weight = reward_dist_weight + self._reward_control_weight = reward_control_weight + + observation_space = Box(low=-np.inf, high=np.inf, shape=(23,), dtype=np.float64) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + def step(self, action): + vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm") + vec_2 = self.get_body_com("object") - self.get_body_com("goal") + + reward_near = -np.linalg.norm(vec_1) * self._reward_near_weight + reward_dist = -np.linalg.norm(vec_2) * self._reward_dist_weight + reward_ctrl = -np.square(action).sum() * self._reward_control_weight + + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + reward = reward_dist + reward_ctrl + reward_near + info = { + "reward_dist": reward_dist, + "reward_ctrl": reward_ctrl, + "reward_near": reward_near, + } + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def reset_model(self): + qpos = self.init_qpos + + self.goal_pos = np.asarray([0, 0]) + while True: + self.cylinder_pos = np.concatenate( + [ + self.np_random.uniform(low=-0.3, high=0, size=1), + self.np_random.uniform(low=-0.2, high=0.2, size=1), + ] + ) + if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17: + break + + qpos[-4:-2] = self.cylinder_pos + qpos[-2:] = self.goal_pos + qvel = self.init_qvel + self.np_random.uniform( + low=-0.005, high=0.005, size=self.model.nv + ) + qvel[-4:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate( + [ + self.data.qpos.flat[:7], + self.data.qvel.flat[:7], + self.get_body_com("tips_arm"), + self.get_body_com("object"), + self.get_body_com("goal"), + ] + ) diff --git a/gymnasium/envs/mujoco/reacher_v4.py b/gymnasium/envs/mujoco/reacher_v4.py index 49c6d4892..9bc233bc3 100644 --- a/gymnasium/envs/mujoco/reacher_v4.py +++ b/gymnasium/envs/mujoco/reacher_v4.py @@ -9,117 +9,6 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a - target that is spawned at a random position. - - ## Action Space - The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------| - | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | - | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | - - ## Observation Space - Observations consist of - - - The cosine of the angles of the two arms - - The sine of the angles of the two arms - - The coordinates of the target - - The angular velocities of the arms - - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0) - - The observation is a `Box(-Inf, Inf, (11,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ---------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | - | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | - | 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless | - | 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless | - | 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | - | 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | - | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) | - | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) | - | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | - | 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | - | 10 | z-value of position_fingertip - position_target (constantly 0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) | - - - Most Gym environments just return the positions and velocity of the - joints in the `.xml` file as the state of the environment. However, in - reacher the state is created by combining only certain elements of the - position and velocity, and performing some function transformations on them. - If one is to read the `.xml` for reacher then they will find 4 joints: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------| - | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) | - | 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) | - | 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | - | 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | - - - ## Rewards - The reward consists of two parts: - - *reward_distance*: This reward is a measure of how far the *fingertip* - of the reacher (the unattached end) is from the target, with a more negative - value assigned for when the reacher's *fingertip* is further away from the - target. It is calculated as the negative vector norm of (position of - the fingertip - position of target), or *-norm("fingertip" - "target")*. - - *reward_control*: A negative reward for penalising the walker if - it takes actions that are too large. It is measured as the negative squared - Euclidean norm of the action, i.e. as *- sum(action2)*. - - The total reward returned is ***reward*** *=* *reward_distance + reward_control* - - Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms. - However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, - you should create a wrapper that computes the weighted reward from `info`. - - - ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a noise added for stochasticity. A uniform noise in the range - [-0.1, 0.1] is added to the positional attributes, while the target position - is selected uniformly at random in a disk of radius 0.2 around the origin. - Independent, uniform noise in the - range of [-0.005, 0.005] is added to the velocities, and the last - element ("fingertip" - "target") is calculated at the end once everything - is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02* - - ## Episode End - - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps) - 2. Termination: Any of the state space values is no longer finite. - - ## Arguments - - No additional arguments are currently supported (in v2 and lower), - but modifications can be made to the XML file in the assets folder - (or by changing the path to a modified XML file in another folder).. - - ```python - import gymnasium as gym - env = gym.make('Reacher-v4') - ``` - - There is no v3 for Reacher, unlike the robot environments where a v3 and - beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py new file mode 100644 index 000000000..6b0bdd630 --- /dev/null +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -0,0 +1,239 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0} + + +class ReacherEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a + target that is spawned at a random position. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Reacher-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Reacher-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Reacher-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max |Name (in corresponding XML file)| Joint | Type (Unit) | + |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------------|-------|--------------| + | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | + | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | + + + ## Observation Space + Observations consist of + + - The cosine of the angles of the two arms + - The sine of the angles of the two arms + - The coordinates of the target + - The angular velocities of the arms + - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0) + + The observation is a `Box(-Inf, Inf, (10,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ---------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | + | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | + | 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless | + | 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless | + | 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | + | 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | + | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) | + | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) | + | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | + | 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | + | excluded | z-value of position_fingertip - position_target (constantly 0 since reacher is 2d) | -Inf | Inf | NA | slide | position (m) | + + Most Gym environments just return the positions and velocity of the + joints in the `.xml` file as the state of the environment. However, in + reacher the state is created by combining only certain elements of the + position and velocity, and performing some function transformations on them. + If one is to read the `reacher.xml` then they will find 4 joints: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------| + | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) | + | 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) | + | 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | + | 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | + + + ## Rewards + The total reward is: ***reward*** *=* *reward_distance + reward_control*. + + - *reward_distance*: + This reward is a measure of how far the *fingertip* of the reacher (the unattached end) is from the target, + with a more negative value assigned for if the reacher's *fingertip* is further away from the target. + It is $-w_{near} \|(P_{fingertip} - P_{target})\|_2$. + where $w_{near}$ is the `reward_near_weight`. + - *reward_control*: + A negative reward to penalize the walker for taking actions that are too large. + It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. + where $w_{control}$ is the `reward_control_weight`. + + `info` contains the individual reward terms. + + ## Starting State + The initial position state of the reacher arm is $\mathcal{U}_{[-0.1 \times 1_{2}, 0.1 \times 1_{2}]}$. + The position state of the goal is (permanently) $\mathcal{S}(0.2)$. + The initial velocity state of the Reacher arm is $\mathcal{U}_{[-0.005 \times 1_{2}, 0.005 \times 1_{2}]}$. + The velocity state of the object is (permanently) $0_2$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution and $\mathcal{S} is the uniform continuous spherical distribution. + + The default frame rate is 2, with each frame lasting for 0.01, so *dt = 2 * 0.01 = 0.02*. + + + ## Episode End + #### Termination + The Reacher never terminates. + + #### Truncation + The default duration of an episode is 50 timesteps + + + ## Arguments + Reacher provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('Reacher-v5', xml_file=...) + ``` + + | Parameter | Type | Default |Description | + |-------------------------|------------|--------------|----------------------------------------------------------| + | `xml_file` | **str** |`"reacher.xml"`| Path to a MuJoCo model | + | `reward_dist_weight` | **float** | `1` | Weight for *reward_dist* term (see section on reward) | + | `reward_control_weight` | **float** | `0.1` | Weight for *reward_control* term (see section on reward) | + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Removed `"z - position_fingertip"` from the observation space since it is always 0, and therefore provides no useful information to the agent, this should result is slightly faster training (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Added `xml_file` argument. + - Added `reward_dist_weight`, `reward_control_weight` arguments, to configure the reward function (defaults are effectively the same as in `v4`). + - Fixed `info["reward_ctrl"]` being not being multiplied by the reward weight. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "reacher.xml", + frame_skip: int = 2, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + reward_dist_weight: float = 1, + reward_control_weight: float = 1, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + reward_dist_weight, + reward_control_weight, + **kwargs, + ) + + self._reward_dist_weight = reward_dist_weight + self._reward_control_weight = reward_control_weight + + observation_space = Box(low=-np.inf, high=np.inf, shape=(10,), dtype=np.float64) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + def step(self, action): + vec = self.get_body_com("fingertip") - self.get_body_com("target") + reward_dist = -np.linalg.norm(vec) * self._reward_dist_weight + reward_ctrl = -np.square(action).sum() * self._reward_control_weight + + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + reward = reward_dist + reward_ctrl + info = { + "reward_dist": reward_dist, + "reward_ctrl": reward_ctrl, + } + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def reset_model(self): + qpos = ( + self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + + self.init_qpos + ) + while True: + self.goal = self.np_random.uniform(low=-0.2, high=0.2, size=2) + if np.linalg.norm(self.goal) < 0.2: + break + qpos[-2:] = self.goal + qvel = self.init_qvel + self.np_random.uniform( + low=-0.005, high=0.005, size=self.model.nv + ) + qvel[-2:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + theta = self.data.qpos.flat[:2] + return np.concatenate( + [ + np.cos(theta), + np.sin(theta), + self.data.qpos.flat[2:], + self.data.qvel.flat[:2], + (self.get_body_com("fingertip") - self.get_body_com("target"))[:2], + ] + ) diff --git a/gymnasium/envs/mujoco/swimmer_v4.py b/gymnasium/envs/mujoco/swimmer_v4.py index 6db5f4f08..3f597b86e 100644 --- a/gymnasium/envs/mujoco/swimmer_v4.py +++ b/gymnasium/envs/mujoco/swimmer_v4.py @@ -8,123 +8,6 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment corresponds to the Swimmer environment described in RĂ©mi Coulom's PhD thesis - ["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document). - The environment aims to increase the number of independent state and control - variables as compared to the classic control environments. The swimmers - consist of three or more segments ('***links***') and one less articulation - joints ('***rotors***') - one rotor joint connecting exactly two links to - form a linear chain. The swimmer is suspended in a two dimensional pool and - always starts in the same position (subject to some deviation drawn from an - uniform distribution), and the goal is to move as fast as possible towards - the right by applying torque on the rotors and using the fluids friction. - - ## Notes - - The problem parameters are: - Problem parameters: - * *n*: number of body parts - * *mi*: mass of part *i* (*i* ∈ {1...n}) - * *li*: length of part *i* (*i* ∈ {1...n}) - * *k*: viscous-friction coefficient - - While the default environment has *n* = 3, *li* = 0.1, - and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the - number of links, or to tweak any of the parameters. - - ## Action Space - The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links* - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the first rotor | -1 | 1 | motor1_rot | hinge | torque (N m) | - | 1 | Torque applied on the second rotor | -1 | 1 | motor2_rot | hinge | torque (N m) | - - ## Observation Space - By default, observations consists of: - * Ξi: angle of part *i* with respect to the *x* axis - * Ξi': its derivative with respect to time (angular velocity) - - In the default case, observations do not include the x- and y-coordinates of the front tip. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - Then, the observation space will be `Box(-Inf, Inf, (10,), float64)` where the first two observations - represent the x- and y-coordinates of the front tip. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - By default, the observation is a `Box(-Inf, Inf, (8,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | angle of the front tip | -Inf | Inf | free_body_rot | hinge | angle (rad) | - | 1 | angle of the first rotor | -Inf | Inf | motor1_rot | hinge | angle (rad) | - | 2 | angle of the second rotor | -Inf | Inf | motor2_rot | hinge | angle (rad) | - | 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) | - | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | - | 5 | angular velocity of front tip | -Inf | Inf | free_body_rot | hinge | angular velocity (rad/s) | - | 6 | angular velocity of first rotor | -Inf | Inf | motor1_rot | hinge | angular velocity (rad/s) | - | 7 | angular velocity of second rotor | -Inf | Inf | motor2_rot | hinge | angular velocity (rad/s) | - | excluded | position of the tip along the x-axis | -Inf | Inf | slider1 | slide | position (m) | - | excluded | position of the tip along the y-axis | -Inf | Inf | slider2 | slide | position (m) | - - ## Rewards - The reward consists of two parts: - - *forward_reward*: A reward of moving forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (default is 4), where the frametime is 0.01 - making the - default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer - swims right as desired. - - *ctrl_cost*: A cost for penalising the swimmer if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 1e-4 - - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity. - - ## Episode End - The episode truncates when the episode length is greater than 1000. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - gym.make('Swimmer-v4') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Swimmer-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"swimmer.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-4` | Weight for _ctrl_cost_ term (see section on reward) | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py new file mode 100644 index 000000000..df727261e --- /dev/null +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -0,0 +1,285 @@ +__credits__ = ["Kallinteris-Andreas", "Rushiv Arora"] + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +class SwimmerEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment corresponds to the Swimmer environment described in RĂ©mi Coulom's PhD thesis + ["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document). + The environment aims to increase the number of independent state and control + variables as compared to the classic control environments. The swimmers + consist of three or more segments ('***links***') and one less articulation + joints ('***rotors***') - one rotor joint connecting exactly two links to + form a linear chain. The swimmer is suspended in a two dimensional pool and + always starts in the same position (subject to some deviation drawn from an + uniform distribution), and the goal is to move as fast as possible towards + the right by applying torque on the rotors and using the fluids friction. + + ## Notes + + The problem parameters are: + Problem parameters: + * *n*: number of body parts + * *mi*: mass of part *i* (*i* ∈ {1...n}) + * *li*: length of part *i* (*i* ∈ {1...n}) + * *k*: viscous-friction coefficient + + While the default environment has *n* = 3, *li* = 0.1, + and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the + number of links, or to tweak any of the parameters. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Swimmer-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Swimmer-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Swimmer-v3 | `mujoco-py` | Maintained for reproducibility | + | Swimmer-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links* + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the first rotor | -1 | 1 | motor1_rot | hinge | torque (N m) | + | 1 | Torque applied on the second rotor | -1 | 1 | motor2_rot | hinge | torque (N m) | + + + ## Observation Space + By default, observations consists of: + * Ξi: angle of part *i* with respect to the *x* axis + * Ξi': its derivative with respect to time (angular velocity) + + By default, the observation does not include the x- and y-coordinates of the front tip. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (10,), float64)`, where the first two observations are the x- and y-coordinates of the front tip. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, the observation is a `Box(-Inf, Inf, (8,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | angle of the front tip | -Inf | Inf | free_body_rot | hinge | angle (rad) | + | 1 | angle of the first rotor | -Inf | Inf | motor1_rot | hinge | angle (rad) | + | 2 | angle of the second rotor | -Inf | Inf | motor2_rot | hinge | angle (rad) | + | 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) | + | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | + | 5 | angular velocity of front tip | -Inf | Inf | free_body_rot | hinge | angular velocity (rad/s) | + | 6 | angular velocity of first rotor | -Inf | Inf | motor1_rot | hinge | angular velocity (rad/s) | + | 7 | angular velocity of second rotor | -Inf | Inf | motor2_rot | hinge | angular velocity (rad/s) | + | excluded | position of the tip along the x-axis | -Inf | Inf | slider1 | slide | position (m) | + | excluded | position of the tip along the y-axis | -Inf | Inf | slider2 | slide | position (m) | + + + ## Rewards + The total reward is: ***reward*** *=* *forward_reward - ctrl_cost*. + + - *forward_reward*: + A reward for moving forward, + this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 4), + and `frametime` which is 0.01 - so the default is $dt = 4 \times 0.01 = 0.04$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward to penalize the Swimmer for taking actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-4}$). + + `info` contains the individual reward terms. + + + ## Starting State + The initial position state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{5}, reset\_noise\_scale \times 1_{5}]}$. + The initial velocity state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{5}, reset\_noise\_scale \times 1_{5}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + + ## Episode End + #### Termination + The Swimmer never terminates. + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + Swimmer provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('Swimmer-v5', xml_file=...) + ``` + + | Parameter | Type | Default |Description | + |--------------------------------------------| --------- |-------------- |-------------------------------| + |`xml_file` | **str** |`"swimmer.xml"`| Path to a MuJoCo model | + |`forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward)| + |`ctrl_cost_weight` | **float** | `1e-4` | Weight for _ctrl_cost_ term (see section on reward) | + |`reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + |`exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Restored the `xml_file` argument (was removed in `v4`). + - Added `forward_reward_weight`, `ctrl_cost_weight`, to configure the reward function (defaults are effectively the same as in `v4`). + - Added `reset_noise_scale` argument to set the range of initial states. + - Added `exclude_current_positions_from_observation` argument. + - Replaced `info["reward_fwd"]` and `info["forward_reward"]` with `info["reward_forward"]` to be consistent with the other environments. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen). + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "swimmer.xml", + frame_skip: int = 4, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 1e-4, + reset_noise_scale: float = 0.1, + exclude_current_positions_from_observation: bool = True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + forward_reward_weight, + ctrl_cost_weight, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__(self, xml_file, frame_skip, observation_space=None, **kwargs) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - 2 * exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + xy_position_before = self.data.qpos[0:2].copy() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.data.qpos[0:2].copy() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + forward_reward = self._forward_reward_weight * x_velocity + + ctrl_cost = self.control_cost(action) + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "x_position": xy_position_after[0], + "y_position": xy_position_after[1], + "distance_from_origin": np.linalg.norm(xy_position_after, ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + } + + if self.render_mode == "human": + self.render() + + return observation, reward, False, False, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + observation = np.concatenate([position, velocity]).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + } diff --git a/gymnasium/envs/mujoco/walker2d_v4.py b/gymnasium/envs/mujoco/walker2d_v4.py index 204f17644..fade5de41 100644 --- a/gymnasium/envs/mujoco/walker2d_v4.py +++ b/gymnasium/envs/mujoco/walker2d_v4.py @@ -14,138 +14,6 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment - by adding another set of legs making it possible for the robot to walk forward instead of - hop. Like other Mujoco environments, this environment aims to increase the number of independent state - and control variables as compared to the classic control environments. The walker is a - two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top - (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs - in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. - The goal is to walk in the in the forward (right) - direction by applying torques on the six hinges connecting the seven body parts. - - ## Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | - | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | - | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the walker, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the torso. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be `Box(-Inf, Inf, (18,), float64)` where the first observation - represent the x-coordinates of the torso of the walker. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - of the torso will be returned in `info` with key `"x_position"`. - - By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | - | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | - | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | - | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | - | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | - | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | - | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`, - - *forward_reward*: A reward of walking forward which is measured as - *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. - *dt* is the time between actions and is dependeent on the frame_skip parameter - (default is 4), where the frametime is 0.002 - making the default - *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (positive x direction). - - *ctrl_cost*: A cost for penalising the walker if it - takes actions that are too large. It is measured as - *`ctrl_cost_weight` * sum(action2)* where *`ctrl_cost_weight`* is - a parameter set for the control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ## Episode End - The walker is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` - 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The walker is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4') - ``` - - v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | - | `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | - | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the torso of the walker must be in this range to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py new file mode 100644 index 000000000..aa43a9875 --- /dev/null +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -0,0 +1,354 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict, Tuple + +import numpy as np + +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 2, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 1.15)), + "elevation": -20.0, +} + + +class Walker2dEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment + by adding another set of legs making it possible for the robot to walk forward instead of + hop. Like other Mujoco environments, this environment aims to increase the number of independent state + and control variables as compared to the classic control environments. The walker is a + two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top + (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs + in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. + The goal is to walk in the in the forward (right) + direction by applying torques on the six hinges connecting the seven body parts. + + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Walker2d-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Walker2d-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Walker2d-v3 | `mujoco-py` | Maintained for reproducibility | + | Walker2d-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + + + ## Action Space + The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | + | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | + | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | + + + ## Observation Space + The observation Space consists of the following parts (in order): + + - qpos (8 elements by default):* Position values of the robots's body parts. + - qvel (9 elements):* The velocities of these individual body parts, + (their derivatives). + + By default, the observation does not include the robot's x-coordinate (`rootx`). + This can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (18,), float64)`, where the first observation element is the x--coordinate of the robot. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | + | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | + | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | + | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | + | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | + | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | + | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | + + + ## Rewards + The total reward is: ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost*. + + - *healthy_reward*: + Every timestep that the Walker2d is alive, it receives a fixed reward of value `healthy_reward`, + - *forward_reward*: + A reward for moving forward, + this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 4), + and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward to penalize the Walker2d for taking actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). + + `info` contains the individual reward terms. + + + ## Starting State + The initial position state is $[0, 1.25, 0, 0, 0, 0, 0, 0, 0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{9}, reset\_noise\_scale \times 1_{9}]}$. + The initial velocity state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{9}, reset\_noise\_scale \times 1_{9}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z-coordinate is non-zero so that the walker2d can stand up immediately. + + + ## Episode End + #### Termination + If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Walker2d is unhealthy. + The Walker2d is unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite + 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` + 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` + + #### Truncation + The default duration of an episode is 1000 timesteps + + + ## Arguments + Walker2d provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: + + ```python + import gymnasium as gym + env = gym.make('Walker2d-v5', ctrl_cost_weight=1e-3, ...) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** |`"walker2d_v5.xml"`| Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | + | `healthy_reward` | **float** | `1` | Weight for _healthy_reward_ reward (see section on reward) | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | + | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the torso of the walker must be in this range to be considered healthy | + | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | + | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + + ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - In v2, v3 and v4 the models have different friction values for the two feet (left foot friction == 1.9 and right foot friction == 0.9). The `Walker-v5` model is updated to have the same friction for both feet (set to 1.9). This causes the Walker2d's the right foot to slide less on the surface and therefore require more force to move (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/477)). + - Fixed bug: `healthy_reward` was given on every step (even if the Walker2D is unhealthy), now it is only given if the Walker2d is healthy. The `info` "reward_survive" is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added individual reward terms in `info` (`info["reward_forward"]`, info`["reward_ctrl"]`, `info["reward_survive"]`). + - Added `info["z_distance_from_origin"]` which is equal to the vertical distance of the "torso" body from its initial position. + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "walker2d_v5.xml", + frame_skip: int = 4, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 1e-3, + healthy_reward: float = 1.0, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (0.8, 2.0), + healthy_angle_range: Tuple[float, float] = (-1.0, 1.0), + reset_noise_scale: float = 5e-3, + exclude_current_positions_from_observation: bool = True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + healthy_reward, + terminate_when_unhealthy, + healthy_z_range, + healthy_angle_range, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 1 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 1 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.data.qpos[1:3] + + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + is_healthy = healthy_z and healthy_angle + + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + return terminated + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = np.clip(self.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_survive": healthy_reward, + "x_position": x_position_after, + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + "x_velocity": x_velocity, + } + + if self.render_mode == "human": + self.render() + + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + } diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py new file mode 100644 index 000000000..5dfd4cb3a --- /dev/null +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -0,0 +1,676 @@ +import collections +import warnings + +import mujoco +import numpy as np +import pytest + +import gymnasium as gym +from gymnasium.envs.mujoco.mujoco_env import BaseMujocoEnv, MujocoEnv +from gymnasium.error import Error +from gymnasium.utils.env_checker import check_env +from gymnasium.utils.env_match import check_environments_match + + +ALL_MUJOCO_ENVS = [ + "Ant", + "HalfCheetah", + "Hopper", + "Humanoid", + "HumanoidStandup", + "InvertedDoublePendulum", + "InvertedPendulum", + "Pusher", + "Reacher", + "Swimmer", + "Walker2d", +] + + +# Note: "HumnanoidStandup-v4" does not have `info` +# Note: "Humnanoid-v4/3" & "Ant-v4/3" fail this test +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "HalfCheetah-v5", + "HalfCheetah-v4", + "HalfCheetah-v3", + "Hopper-v5", + "Hopper-v4", + "Hopper-v3", + "Humanoid-v5", + "HumanoidStandup-v5", + "Swimmer-v5", + "Swimmer-v4", + "Swimmer-v3", + "Walker2d-v5", + "Walker2d-v4", + "Walker2d-v3", + ], +) +def test_verify_info_x_position(env_id: str): + """Asserts that the environment has position[0] == info['x_position'].""" + env = gym.make(env_id, exclude_current_positions_from_observation=False) + + _, _ = env.reset() + obs, _, _, _, info = env.step(env.action_space.sample()) + + assert obs[0] == info["x_position"] + + +# Note: "HumnanoidStandup-v4" does not have `info` +# Note: "Humnanoid-v4/3" & "Ant-v4/3" fail this test +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "Humanoid-v5", + "HumanoidStandup-v5", + "Swimmer-v5", + "Swimmer-v4", + "Swimmer-v3", + ], +) +def test_verify_info_y_position(env_id: str): + """Asserts that the environment has position[1] == info['y_position'].""" + env = gym.make(env_id, exclude_current_positions_from_observation=False) + + _, _ = env.reset() + obs, _, _, _, info = env.step(env.action_space.sample()) + + assert obs[1] == info["y_position"] + + +# Note: "HumnanoidStandup-v4" does not have `info` +@pytest.mark.parametrize("env_name", ["HalfCheetah", "Hopper", "Swimmer", "Walker2d"]) +@pytest.mark.parametrize("version", ["v5", "v4", "v3"]) +def test_verify_info_x_velocity(env_name: str, version: str): + """Asserts that the environment `info['x_velocity']` is properly assigned.""" + env = gym.make(f"{env_name}-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() + + old_x = env.data.qpos[0] + _, _, _, _, info = env.step(env.action_space.sample()) + new_x = env.data.qpos[0] + + dx = new_x - old_x + vel_x = dx / env.dt + assert vel_x == info["x_velocity"] + + +# Note: "HumnanoidStandup-v4" does not have `info` +@pytest.mark.parametrize("env_id", ["Swimmer-v5", "Swimmer-v4", "Swimmer-v3"]) +def test_verify_info_y_velocity(env_id: str): + """Asserts that the environment `info['y_velocity']` is properly assigned.""" + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() + + old_y = env.data.qpos[1] + _, _, _, _, info = env.step(env.action_space.sample()) + new_y = env.data.qpos[1] + + dy = new_y - old_y + vel_y = dy / env.dt + assert vel_y == info["y_velocity"] + + +@pytest.mark.parametrize("env_id", ["Ant-v5", "Ant-v4", "Ant-v3"]) +def test_verify_info_xy_velocity_xpos(env_id: str): + """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the ant environment which uses kinmatics for the velocity.""" + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() + + old_xy = env.get_body_com("torso")[:2].copy() + _, _, _, _, info = env.step(env.action_space.sample()) + new_xy = env.get_body_com("torso")[:2].copy() + + dxy = new_xy - old_xy + vel_x, vel_y = dxy / env.dt + assert vel_x == info["x_velocity"] + assert vel_y == info["y_velocity"] + + +@pytest.mark.parametrize("env_id", ["Humanoid-v5", "Humanoid-v4", "Humanoid-v3"]) +def test_verify_info_xy_velocity_com(env_id: str): + """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the humanoid environment which uses kinmatics of Center Of Mass for the velocity.""" + + def mass_center(model, data): + mass = np.expand_dims(model.body_mass, axis=1) + xpos = data.xipos + return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() + + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() + + old_xy = mass_center(env.model, env.data) + _, _, _, _, info = env.step(env.action_space.sample()) + new_xy = mass_center(env.model, env.data) + + dxy = new_xy - old_xy + vel_x, vel_y = dxy / env.dt + assert vel_x == info["x_velocity"] + assert vel_y == info["y_velocity"] + + +# Note: Hopper-v4/3/2 does not have `info['reward_survive']`, but it is still affected +# Note: Walker2d-v4/3/2 does not have `info['reward_survive']`, but it is still affected +# Note: Inverted(Double)Pendulum-v4/2 does not have `info['reward_survive']`, but it is still affected +# Note: all `v4/v3/v2` environments with a heathly reward are fail this test +@pytest.mark.parametrize( + "env_name", + [ + "Ant", + "Hopper", + "Humanoid", + "InvertedDoublePendulum", + "InvertedPendulum", + "Walker2d", + ], +) +@pytest.mark.parametrize("version", ["v5"]) +def test_verify_reward_survive(env_name: str, version: str): + """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states.""" + env = gym.make(f"{env_name}-{version}", reset_noise_scale=0).unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset(seed=0) + env.action_space.seed(1) + + terminal = False + for step in range(80): + obs, rew, terminal, truncated, info = env.step(env.action_space.sample()) + + if terminal: + assert info["reward_survive"] == 0 + break + + assert info["reward_survive"] != 0 + + assert ( + terminal + ), "The environment, should have terminated, if not the test is not valid." + + +CHECK_ENV_IGNORE_WARNINGS = [ + f"\x1b[33mWARN: {message}\x1b[0m" + for message in [ + "A Box observation space minimum value is -infinity. This is probably too low.", + "A Box observation space maximum value is -infinity. This is probably too high.", + "For Box action spaces, we recommend using a symmetric and normalized space (range=[-1, 1] or [0, 1]). See https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html for more information.", + ] +] + + +@pytest.mark.parametrize("env_name", ALL_MUJOCO_ENVS) +@pytest.mark.parametrize("version", ["v5"]) +@pytest.mark.parametrize("frame_skip", [1, 2, 3, 4, 5]) +def test_frame_skip(env_name: str, version: str, frame_skip: int): + """Verify that all `mujoco` envs work with different `frame_skip` values.""" + env_id = f"{env_name}-{version}" + env = gym.make(env_id, frame_skip=frame_skip) + + # Test if env adheres to Gym API + with warnings.catch_warnings(record=True) as w: + check_env(env.unwrapped, skip_render_check=True) + env.close() + for warning in w: + if warning.message.args[0] not in CHECK_ENV_IGNORE_WARNINGS: + raise Error(f"Unexpected warning: {warning.message}") + + +# Dev Note: This can not be version env parametrized because each env has it's own reward function +@pytest.mark.parametrize("version", ["v5"]) +def test_reward_sum(version: str): + """Assert that the total reward equals the sum of the individual reward terms, also asserts that the reward function has no fp ordering arithmetic errors.""" + NUM_STEPS = 100 + env = gym.make(f"Ant-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == (info["reward_survive"] + info["reward_forward"]) - ( + -info["reward_ctrl"] + -info["reward_contact"] + ) + + env = gym.make(f"HalfCheetah-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_forward"] + info["reward_ctrl"] + + env = gym.make(f"Hopper-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert ( + reward + == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] + ) + + env = gym.make(f"Humanoid-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == (info["reward_forward"] + info["reward_survive"]) + ( + info["reward_ctrl"] + info["reward_contact"] + ) + + env = gym.make(f"HumanoidStandup-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert ( + reward + == info["reward_linup"] + + info["reward_quadctrl"] + + info["reward_impact"] + + 1 + ) + + env = gym.make(f"InvertedDoublePendulum-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert ( + reward + == info["reward_survive"] + + info["distance_penalty"] + + info["velocity_penalty"] + ) + + env = gym.make(f"InvertedPendulum-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_survive"] + + env = gym.make(f"Pusher-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_dist"] + info["reward_ctrl"] + info["reward_near"] + + env = gym.make(f"Reacher-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_dist"] + info["reward_ctrl"] + + env = gym.make(f"Swimmer-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_forward"] + info["reward_ctrl"] + + env = gym.make(f"Walker2d-{version}") + env.reset() + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert ( + reward + == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] + ) + + +env_conf = collections.namedtuple("env_conf", "env_name, obs, rew, term, info") + + +# Note: the environtments "HalfCheetah", "Pusher", "Swimmer", are identical between `v4` & `v5` (excluding `info`) +@pytest.mark.parametrize( + "env_conf", + [ + env_conf("Ant", True, True, False, "skip"), + env_conf("HalfCheetah", False, False, False, "skip"), + env_conf("Hopper", False, True, False, "superset"), + # skipping humanoid, everything has changed + env_conf("HumanoidStandup", True, False, False, "superset"), + env_conf("InvertedDoublePendulum", True, True, False, "superset"), + env_conf("InvertedPendulum", False, True, False, "superset"), + env_conf("Pusher", False, False, False, "keys-superset"), + env_conf("Reacher", True, False, False, "keys-equivalence"), + env_conf("Swimmer", False, False, False, "skip"), + env_conf("Walker2d", True, True, True, "keys-superset"), + ], +) +def test_identical_behaviour_v45(env_conf): + """Verify that v4 -> v5 transition. Does not change the behaviour of the environments in any unexpected way.""" + NUM_STEPS = 100 + env_v4 = gym.make(f"{env_conf.env_name}-v4") + env_v5 = gym.make(f"{env_conf.env_name}-v5") + + check_environments_match( + env_v4, + env_v5, + NUM_STEPS, + skip_obs=env_conf.obs, + skip_rew=env_conf.rew, + skip_terminal=env_conf.term, + info_comparison=env_conf.info, + ) + + +@pytest.mark.parametrize("version", ["v5", "v4"]) +def test_ant_com(version: str): + """Verify the kinmatic behaviour of the ant.""" + # `env` contains `data : MjData` and `model : MjModel` + env = gym.make(f"Ant-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() # randomly initlizies the `data.qpos` and `data.qvel`, calls mujoco.mj_forward(env.model, env.data) + + x_position_before = env.data.qpos[0] + x_position_before_com = env.data.body("torso").xpos[0] + assert x_position_before == x_position_before_com, "before failed" # This succeeds + + random_control = env.action_space.sample() + # This calls mujoco.mj_step(env.model, env.data, nstep=env.frame_skip) + _, _, _, _, info = env.step(random_control) + mujoco.mj_kinematics(env.model, env.data) + + x_position_after = env.data.qpos[0] + x_position_after_com = env.data.body("torso").xpos[0] + assert x_position_after == x_position_after_com, "after failed" # This succeeds + + +@pytest.mark.parametrize("version", ["v5", "v4", "v3", "v2"]) +def test_set_state(version: str): + """Simple Test to verify that `mujocoEnv.set_state()` works correctly.""" + env = gym.make(f"Hopper-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() + new_qpos = np.array( + [0.00136962, 1.24769787, -0.00459026, -0.00483472, 0.0031327, 0.00412756] + ) + new_qvel = np.array( + [0.00106636, 0.00229497, 0.00043625, 0.00435072, 0.00315854, -0.00497261] + ) + env.set_state(new_qpos, new_qvel) + assert (env.data.qpos == new_qpos).all() + assert (env.data.qvel == new_qvel).all() + + +# Note: HumanoidStandup-v4/v3 does not have `info` +# Note: Ant-v4/v3 fails this test +# Note: Humanoid-v4/v3 fails this test +# Note: v2 does not have `info` +@pytest.mark.parametrize( + "env_id", ["Ant-v5", "Humanoid-v5", "Swimmer-v5", "Swimmer-v4", "Swimmer-v3"] +) +def test_distance_from_origin_info(env_id: str): + """Verify that `info"distance_from_origin"` is correct.""" + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() + + _, _, _, _, info = env.step(env.action_space.sample()) + assert info["distance_from_origin"] == np.linalg.norm( + env.data.qpos[0:2] - env.init_qpos[0:2] + ) + + +@pytest.mark.parametrize("env_name", ["Hopper", "HumanoidStandup", "Walker2d"]) +@pytest.mark.parametrize("version", ["v5"]) +def test_z_distance_from_origin_info(env_name: str, version: str): + """Verify that `info["z_distance_from_origin"]` is correct.""" + env = gym.make(f"{env_name}-{version}").unwrapped + assert isinstance(env, MujocoEnv) + env.reset() + + _, _, _, _, info = env.step(env.action_space.sample()) + mujoco.mj_kinematics(env.model, env.data) + z_index = env.observation_structure["skipped_qpos"] + assert ( + info["z_distance_from_origin"] + == env.data.qpos[z_index] - env.init_qpos[z_index] + ) + + +@pytest.mark.parametrize("env_name", ALL_MUJOCO_ENVS) +@pytest.mark.parametrize("version", ["v5"]) +def test_observation_structure(env_name: str, version: str): + """Verify that the `env.observation_structure` is properly defined.""" + env = gym.make(f"{env_name}-{version}").unwrapped + assert isinstance(env, MujocoEnv) + if not hasattr(env, "observation_structure"): + return + + obs_struct = env.observation_structure + + assert env.model.nq == obs_struct.get("skipped_qpos", 0) + obs_struct["qpos"] + assert env.model.nv == obs_struct["qvel"] + if obs_struct.get("cinert", False): + assert (env.model.nbody - 1) * 10 == obs_struct["cinert"] + if obs_struct.get("cvel", False): + assert (env.model.nbody - 1) * 6 == obs_struct["cvel"] + if obs_struct.get("qfrc_actuator", False): + assert env.model.nv - 6 == obs_struct["qfrc_actuator"] + if obs_struct.get("cfrc_ext", False): + assert (env.model.nbody - 1) * 6 == obs_struct["cfrc_ext"] + if obs_struct.get("ten_lenght", False): + assert env.model.ntendon == obs_struct["ten_lenght"] + if obs_struct.get("ten_velocity", False): + assert env.model.ntendon == obs_struct["ten_velocity"] + + +@pytest.mark.parametrize( + "env_name", + [ + "Ant", + "HalfCheetah", + "Hopper", + "Humanoid", + "HumanoidStandup", + # "InvertedDoublePendulum", + # "InvertedPendulum", + # "Pusher", + # "Reacher", + "Swimmer", + "Walker2d", + ], +) +@pytest.mark.parametrize("version", ["v5"]) +def test_reset_info(env_name: str, version: str): + """Verify that the environment returns info with `reset()`.""" + env = gym.make(f"{env_name}-{version}") + _, reset_info = env.reset() + assert len(reset_info) > 0 + + +# Note: the max height used to be wrong in the documentation. (1.196m instead of 1.2m) +@pytest.mark.parametrize("version", ["v5"]) +def test_inverted_double_pendulum_max_height(version: str): + """Verify the max height of Inverted Double Pendulum.""" + env = gym.make(f"InvertedDoublePendulum-{version}", reset_noise_scale=0).unwrapped + assert isinstance(env, BaseMujocoEnv) + env.reset() + + y = env.data.site_xpos[0][2] + assert y == 1.2 + + +@pytest.mark.parametrize("version", ["v4"]) +def test_inverted_double_pendulum_max_height_old(version: str): + """Verify the max height of Inverted Double Pendulum (v4 does not have `reset_noise_scale` argument).""" + env = gym.make(f"InvertedDoublePendulum-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + env.set_state(env.init_qpos, env.init_qvel) + + y = env.data.site_xpos[0][2] + assert y == 1.2 + + +# note: fails with `brax==0.9.0` +@pytest.mark.parametrize("version", ["v5", "v4"]) +def test_model_object_count(version: str): + """Verify that all the objects of the model are loaded, mostly useful for using non-mujoco simulator.""" + env = gym.make(f"Ant-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 15 + assert env.model.nv == 14 + assert env.model.nu == 8 + assert env.model.nbody == 14 + assert env.model.nbvh == 14 + assert env.model.njnt == 9 + assert env.model.ngeom == 14 + assert env.model.ntendon == 0 + + env = gym.make(f"HalfCheetah-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 9 + assert env.model.nv == 9 + assert env.model.nu == 6 + assert env.model.nbody == 8 + assert env.model.nbvh == 10 + assert env.model.njnt == 9 + assert env.model.ngeom == 9 + assert env.model.ntendon == 0 + + env = gym.make(f"Hopper-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 6 + assert env.model.nv == 6 + assert env.model.nu == 3 + assert env.model.nbody == 5 + assert env.model.nbvh == 5 + assert env.model.njnt == 6 + assert env.model.ngeom == 5 + assert env.model.ntendon == 0 + + env = gym.make(f"Humanoid-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 24 + assert env.model.nv == 23 + assert env.model.nu == 17 + assert env.model.nbody == 14 + assert env.model.nbvh == 22 + assert env.model.njnt == 18 + assert env.model.ngeom == 18 + assert env.model.ntendon == 2 + + env = gym.make(f"HumanoidStandup-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 24 + assert env.model.nv == 23 + assert env.model.nu == 17 + assert env.model.nbody == 14 + assert env.model.nbvh == 22 + assert env.model.njnt == 18 + assert env.model.ngeom == 18 + assert env.model.ntendon == 2 + + env = gym.make(f"InvertedDoublePendulum-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 3 + assert env.model.nv == 3 + assert env.model.nu == 1 + assert env.model.nbody == 4 + assert env.model.nbvh == 6 + assert env.model.njnt == 3 + assert env.model.ngeom == 5 + assert env.model.ntendon == 0 + + env = gym.make(f"InvertedPendulum-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 2 + assert env.model.nv == 2 + assert env.model.nu == 1 + assert env.model.nbody == 3 + assert env.model.nbvh == 3 + assert env.model.njnt == 2 + assert env.model.ngeom == 3 + assert env.model.ntendon == 0 + + env = gym.make(f"Pusher-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 11 + assert env.model.nv == 11 + assert env.model.nu == 7 + assert env.model.nbody == 13 + assert env.model.nbvh == 18 + assert env.model.njnt == 11 + assert env.model.ngeom == 21 + assert env.model.ntendon == 0 + + env = gym.make(f"Reacher-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 4 + assert env.model.nv == 4 + assert env.model.nu == 2 + assert env.model.nbody == 5 + assert env.model.nbvh == 5 + assert env.model.njnt == 4 + assert env.model.ngeom == 10 + assert env.model.ntendon == 0 + + env = gym.make(f"Swimmer-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 5 + assert env.model.nv == 5 + assert env.model.nu == 2 + assert env.model.nbody == 4 + assert env.model.nbvh == 4 + assert env.model.njnt == 5 + assert env.model.ngeom == 4 + assert env.model.ntendon == 0 + + env = gym.make(f"Walker2d-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 9 + assert env.model.nv == 9 + assert env.model.nu == 6 + assert env.model.nbody == 8 + assert env.model.nbvh == 8 + assert env.model.njnt == 9 + assert env.model.ngeom == 8 + assert env.model.ntendon == 0 + + +def test_dt(): + """Assert that env.dt gets assigned correctly.""" + env_a = gym.make("Ant-v5", include_cfrc_ext_in_observation=False).unwrapped + env_b = gym.make( + "Ant-v5", include_cfrc_ext_in_observation=False, frame_skip=1 + ).unwrapped + assert isinstance(env_a, BaseMujocoEnv) + assert isinstance(env_b, BaseMujocoEnv) + env_b.model.opt.timestep = 0.05 + + assert env_a.dt == env_b.dt + # check_environments_match(env_a, env_b, num_steps=100) # This Fails as expected + + +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "Ant-v4", + "Ant-v3", + "HalfCheetah-v5", + "HalfCheetah-v4", + "HalfCheetah-v3", + "Hopper-v5", + "Hopper-v4", + "Hopper-v3", + "Humanoid-v5", + "Humanoid-v4", + "Humanoid-v3", + "HumanoidStandup-v5", + "InvertedDoublePendulum-v5", + "InvertedPendulum-v5", + "Swimmer-v5", + "Swimmer-v4", + "Swimmer-v3", + "Walker2d-v5", + "Walker2d-v4", + "Walker2d-v3", + ], +) +def test_reset_noise_scale(env_id): + """Checks that when `reset_noise_scale=0` we have deterministic initialization.""" + env: BaseMujocoEnv = gym.make(env_id, reset_noise_scale=0).unwrapped + env.reset() + + assert np.all(env.data.qpos == env.init_qpos) + assert np.all(env.data.qvel == env.init_qvel) diff --git a/tests/envs/registration/test_make.py b/tests/envs/registration/test_make.py index 0262c8831..2d54f2ed6 100644 --- a/tests/envs/registration/test_make.py +++ b/tests/envs/registration/test_make.py @@ -500,7 +500,7 @@ def test_make_errors(): with pytest.raises( gym.error.Error, match=re.escape( - "Environment version v0 for `Humanoid` is deprecated. Please use `Humanoid-v4` instead." + "Environment version v0 for `Humanoid` is deprecated. Please use `Humanoid-v5` instead." ), ): gym.make("Humanoid-v0")