From cf153f66ad14ac83d1ae997e9dc12acf2d3c618a Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas <30759571+Kallinteris-Andreas@users.noreply.github.com> Date: Tue, 2 May 2023 17:55:43 +0300 Subject: [PATCH 01/53] Add Hopper and Walker2D models for v5 --- gymnasium/envs/mujoco/assets/hopper_v5.xml | 53 +++++++++++++++ gymnasium/envs/mujoco/assets/walker2d_v5.xml | 68 ++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 gymnasium/envs/mujoco/assets/hopper_v5.xml create mode 100644 gymnasium/envs/mujoco/assets/walker2d_v5.xml diff --git a/gymnasium/envs/mujoco/assets/hopper_v5.xml b/gymnasium/envs/mujoco/assets/hopper_v5.xml new file mode 100644 index 000000000..e9ec942d7 --- /dev/null +++ b/gymnasium/envs/mujoco/assets/hopper_v5.xml @@ -0,0 +1,53 @@ + + + + + + + + + diff --git a/gymnasium/envs/mujoco/assets/walker2d_v5.xml b/gymnasium/envs/mujoco/assets/walker2d_v5.xml new file mode 100644 index 000000000..cf8042a7c --- /dev/null +++ b/gymnasium/envs/mujoco/assets/walker2d_v5.xml @@ -0,0 +1,68 @@ + + + + + + + + From 0cbdd72943166cc174242a558fde30843fd6b21b Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas <30759571+Kallinteris-Andreas@users.noreply.github.com> Date: Tue, 9 May 2023 12:13:49 +0300 Subject: [PATCH 02/53] Delete hopper_v5.xml --- gymnasium/envs/mujoco/assets/hopper_v5.xml | 53 ---------------------- 1 file changed, 53 deletions(-) delete mode 100644 gymnasium/envs/mujoco/assets/hopper_v5.xml diff --git a/gymnasium/envs/mujoco/assets/hopper_v5.xml b/gymnasium/envs/mujoco/assets/hopper_v5.xml deleted file mode 100644 index e9ec942d7..000000000 --- a/gymnasium/envs/mujoco/assets/hopper_v5.xml +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - From db3734ea0c23698724e05b1f72042b7dca9b8b7b Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas <30759571+Kallinteris-Andreas@users.noreply.github.com> Date: Tue, 9 May 2023 12:14:07 +0300 Subject: [PATCH 03/53] Delete walker2d_v5.xml --- gymnasium/envs/mujoco/assets/walker2d_v5.xml | 68 -------------------- 1 file changed, 68 deletions(-) delete mode 100644 gymnasium/envs/mujoco/assets/walker2d_v5.xml diff --git a/gymnasium/envs/mujoco/assets/walker2d_v5.xml b/gymnasium/envs/mujoco/assets/walker2d_v5.xml deleted file mode 100644 index cf8042a7c..000000000 --- a/gymnasium/envs/mujoco/assets/walker2d_v5.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - From a2d2e64a4595a046ab2b826bb2de3d8f82f9b4bf Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 9 May 2023 16:19:18 +0300 Subject: [PATCH 04/53] General MuJoCo Env Documention Cleanup --- gymnasium/envs/mujoco/ant_v4.py | 1 - gymnasium/envs/mujoco/half_cheetah_v4.py | 17 +- gymnasium/envs/mujoco/hopper_v4.py | 18 +- gymnasium/envs/mujoco/humanoid_v4.py | 21 +- gymnasium/envs/mujoco/humanoidstandup_v4.py | 93 +++++-- gymnasium/envs/mujoco/pusher_v4.py | 2 +- gymnasium/envs/mujoco/reacher_v4.py | 7 +- gymnasium/envs/mujoco/swimmer_v4.py | 7 +- gymnasium/envs/mujoco/walker2d_v4.py | 257 ++++++++++---------- 9 files changed, 237 insertions(+), 186 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v4.py b/gymnasium/envs/mujoco/ant_v4.py index b95b3aa51..9775aad59 100644 --- a/gymnasium/envs/mujoco/ant_v4.py +++ b/gymnasium/envs/mujoco/ant_v4.py @@ -38,7 +38,6 @@ class AntEnv(MujocoEnv, utils.EzPickle): | 7 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | ## Observation Space - Observations consist of positional values of different body parts of the ant, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. diff --git a/gymnasium/envs/mujoco/half_cheetah_v4.py b/gymnasium/envs/mujoco/half_cheetah_v4.py index b0286ec0d..13ba74669 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v4.py +++ b/gymnasium/envs/mujoco/half_cheetah_v4.py @@ -18,7 +18,7 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): This environment is based on the work by P. Wawrzyński in ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf). - The HalfCheetah is a 2-dimensional robot consisting of 9 links and 8 + The HalfCheetah is a 2-dimensional robot consisting of 9 body parts and 8 joints connecting them (including two paws). The goal is to apply a torque on the joints to make the cheetah run forward (right) as fast as possible, with a positive reward allocated based on the distance moved forward and a @@ -28,7 +28,7 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): (connecting to the thighs) and feet (connecting to the shins). ## Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied between *links*. + The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | | --- | --------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | @@ -41,22 +41,21 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): ## Observation Space - Observations consist of positional values of different body parts of the cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - By default, observations do not include the x-coordinate of the cheetah's center of mass. It may + By default, observations do not include the cheetah's `rootx`. It may be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 18 dimensions where the first dimension - represents the x-coordinate of the cheetah's center of mass. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate + In that case, the observation will be a `Box(-Inf, Inf, (18,), float64)` where the first element + represents the `rootx`/ + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the will be returned in `info` with key `"x_position"`. - However, by default, the observation is a `ndarray` with shape `(17,)` where the elements correspond to the following: - + However, by default, the observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | + | excluded | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) | | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | diff --git a/gymnasium/envs/mujoco/hopper_v4.py b/gymnasium/envs/mujoco/hopper_v4.py index e4191beea..e399aa0c1 100644 --- a/gymnasium/envs/mujoco/hopper_v4.py +++ b/gymnasium/envs/mujoco/hopper_v4.py @@ -28,7 +28,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): connecting the four body parts. ## Action Space - The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied between *links* + The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied at the hinge joints. | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| @@ -37,31 +37,31 @@ class HopperEnv(MujocoEnv, utils.EzPickle): | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | ## Observation Space - Observations consist of positional values of different body parts of the hopper, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. By default, observations do not include the x-coordinate of the hopper. It may be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 12 dimensions where the first dimension + In that case, the observation space will be `Box(-Inf, Inf, (12,), float64)` where the first observation represents the x-coordinate of the hopper. Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate will be returned in `info` with key `"x_position"`. - However, by default, the observation is a `ndarray` with shape `(11,)` where the elements + However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements correspond to the following: | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | ------------------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | z-coordinate of the top (height of hopper) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the top | -Inf | Inf | rooty | hinge | angle (rad) | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | + | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | velocity of the x-coordinate of the top | -Inf | Inf | rootx | slide | velocity (m/s) | - | 6 | velocity of the z-coordinate (height) of the top | -Inf | Inf | rootz | slide | velocity (m/s) | - | 7 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 5 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | + | 6 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | + | 7 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | diff --git a/gymnasium/envs/mujoco/humanoid_v4.py b/gymnasium/envs/mujoco/humanoid_v4.py index bc6ecddd7..341ac6c28 100644 --- a/gymnasium/envs/mujoco/humanoid_v4.py +++ b/gymnasium/envs/mujoco/humanoid_v4.py @@ -44,31 +44,32 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | ## Observation Space - Observations consist of positional values of different body parts of the Humanoid, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. By default, observations do not include the x- and y-coordinates of the torso. These may be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-1, 1, (378,), float64)` where the first two observations + In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations represent the x- and y-coordinates of the torso. Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - However, by default, the observation is a `Box(-1, 1, (376,), float64)`. The elements correspond to the following: + However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following: | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | diff --git a/gymnasium/envs/mujoco/humanoidstandup_v4.py b/gymnasium/envs/mujoco/humanoidstandup_v4.py index 2cee9058d..797c37ecd 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v4.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v4.py @@ -52,16 +52,23 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | ## Observation Space + Observations consist of positional values of different body parts of the Humanoid, + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. - The state space consists of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + By default, observations do not include the x- and y-coordinates of the torso. These may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations + represent the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates + will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - **Note:** The x- and y-coordinates of the torso are being omitted to produce position-agnostic behavior in policies - - The observation is a `ndarray` with shape `(376,)` where the elements correspond to the following: + However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following: | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | @@ -96,21 +103,20 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 35 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 36 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 37 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 38 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 39 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 40 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 41 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 42 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 43 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 44 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 45 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | - + | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | + | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | + | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | + | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | + | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | + | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | + | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | + | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | + | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | + | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | + | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | Additionally, after all the positional and velocity based values in the table, - the state_space consists of (in order): + the observation contains (in order): - *cinert:* Mass and inertia of a single rigid body relative to the center of mass (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) and hence adds to another 140 elements in the state space. @@ -120,8 +126,55 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. - *cfrc_ext:* This is the center of mass based external force on the body. It has shape 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the number - of degrees of freedom (*= dim(qvel)*) + where *nbody* stands for the number of bodies in the robot and *nv* stands for the + number of degrees of freedom (*= dim(qvel)*) + + The body parts are: + + | id (for `v2`,`v3`,`v4`) | body part | + | --- | ------------ | + | 0 | worldBody (note: all values are constant 0) | + | 1 | torso | + | 2 | lwaist | + | 3 | pelvis | + | 4 | right_thigh | + | 5 | right_sin | + | 6 | right_foot | + | 7 | left_thigh | + | 8 | left_sin | + | 9 | left_foot | + | 10 | right_upper_arm | + | 11 | right_lower_arm | + | 12 | left_upper_arm | + | 13 | left_lower_arm | + + The joints are: + + | id (for `v2`,`v3`,`v4`) | joint | + | --- | ------------ | + | 0 | root | + | 1 | root | + | 2 | root | + | 3 | root | + | 4 | root | + | 5 | root | + | 6 | abdomen_z | + | 7 | abdomen_y | + | 8 | abdomen_x | + | 9 | right_hip_x | + | 10 | right_hip_z | + | 11 | right_hip_y | + | 12 | right_knee | + | 13 | left_hip_x | + | 14 | left_hiz_z | + | 15 | left_hip_y | + | 16 | left_knee | + | 17 | right_shoulder1 | + | 18 | right_shoulder2 | + | 19 | right_elbow| + | 20 | left_shoulder1 | + | 21 | left_shoulder2 | + | 22 | left_elfbow | The (x,y,z) coordinates are translational DOFs while the orientations are rotational DOFs expressed as quaternions. One can read more about free joints on the diff --git a/gymnasium/envs/mujoco/pusher_v4.py b/gymnasium/envs/mujoco/pusher_v4.py index a2929cd1e..55ba517e0 100644 --- a/gymnasium/envs/mujoco/pusher_v4.py +++ b/gymnasium/envs/mujoco/pusher_v4.py @@ -41,7 +41,7 @@ class PusherEnv(MujocoEnv, utils.EzPickle): - The coordinates of the object to be moved - The coordinates of the goal position - The observation is a `ndarray` with shape `(23,)` where the elements correspond to the table below. + The observation is a `Box(-Inf, Inf, (23,), float64)` where the elements correspond to the table below. An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the same as human joints. diff --git a/gymnasium/envs/mujoco/reacher_v4.py b/gymnasium/envs/mujoco/reacher_v4.py index b701cff3f..49c6d4892 100644 --- a/gymnasium/envs/mujoco/reacher_v4.py +++ b/gymnasium/envs/mujoco/reacher_v4.py @@ -23,7 +23,6 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | ## Observation Space - Observations consist of - The cosine of the angles of the two arms @@ -32,7 +31,7 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): - The angular velocities of the arms - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0) - The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following: + The observation is a `Box(-Inf, Inf, (11,), float64)` where the elements correspond to the following: | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | ---------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | @@ -40,8 +39,8 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | | 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless | | 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless | - | 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | - | 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | + | 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | + | 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) | | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) | | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | diff --git a/gymnasium/envs/mujoco/swimmer_v4.py b/gymnasium/envs/mujoco/swimmer_v4.py index e2fe8b2c8..a936ca71a 100644 --- a/gymnasium/envs/mujoco/swimmer_v4.py +++ b/gymnasium/envs/mujoco/swimmer_v4.py @@ -44,19 +44,18 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): | 1 | Torque applied on the second rotor | -1 | 1 | motor2_rot | hinge | torque (N m) | ## Observation Space - By default, observations consists of: * θi: angle of part *i* with respect to the *x* axis * θi': its derivative with respect to time (angular velocity) In the default case, observations do not include the x- and y-coordinates of the front tip. These may be included by passing `exclude_current_positions_from_observation=False` during construction. - Then, the observation space will have 10 dimensions where the first two dimensions + Then, the observation space will be `Box(-Inf, Inf, (10,), float64)` where the first two observations represent the x- and y-coordinates of the front tip. Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - By default, the observation is a `ndarray` with shape `(8,)` where the elements correspond to the following: + By default, the observation is a `Box(-Inf, Inf, (8,), float64)` where the elements correspond to the following: | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | @@ -67,7 +66,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | | 5 | angular velocity of front tip | -Inf | Inf | free_body_rot | hinge | angular velocity (rad/s) | | 6 | angular velocity of first rotor | -Inf | Inf | motor1_rot | hinge | angular velocity (rad/s) | - | 7 | angular velocity of second rotor | -Inf | Inf | motor2_rot | hinge | angular velocity (rad/s) | + | 7 | angular velocity of second rotor | -Inf | Inf | motor2_rot | hinge | angular velocity (rad/s) | ## Rewards The reward consists of two parts: diff --git a/gymnasium/envs/mujoco/walker2d_v4.py b/gymnasium/envs/mujoco/walker2d_v4.py index 83756b190..fd2b14f28 100644 --- a/gymnasium/envs/mujoco/walker2d_v4.py +++ b/gymnasium/envs/mujoco/walker2d_v4.py @@ -15,134 +15,135 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): """ - ## Description - - This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment - by adding another set of legs making it possible for the robot to walk forward instead of - hop. Like other Mujoco environments, this environment aims to increase the number of independent state - and control variables as compared to the classic control environments. The walker is a - two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top - (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs - in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. - The goal is to walk in the in the forward (right) - direction by applying torques on the six hinges connecting the seven body parts. - - ## Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | - | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | - | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the walker, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the top. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will have 18 dimensions where the first dimension - represent the x-coordinates of the top of the walker. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - of the top will be returned in `info` with key `"x_position"`. - - By default, observation is a `ndarray` with shape `(17,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | - | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | - | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | - | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | velocity of the z-coordinate (height) of the rorso | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angular velocity of the angle of the top | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | - | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | - | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`, - - *forward_reward*: A reward of walking forward which is measured as - *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. - *dt* is the time between actions and is dependeent on the frame_skip parameter - (default is 4), where the frametime is 0.002 - making the default - *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (positive x direction). - - *ctrl_cost*: A cost for penalising the walker if it - takes actions that are too large. It is measured as - *`ctrl_cost_weight` * sum(action2)* where *`ctrl_cost_weight`* is - a parameter set for the control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ## Episode End - The walker is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` - 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The walker is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4') - ``` - - v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | - | `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | - | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the top of the walker must be in this range to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) + ## Description + + This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment + by adding another set of legs making it possible for the robot to walk forward instead of + hop. Like other Mujoco environments, this environment aims to increase the number of independent state + and control variables as compared to the classic control environments. The walker is a + two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top + (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs + in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. + The goal is to walk in the in the forward (right) + direction by applying torques on the six hinges connecting the seven body parts. + + ## Action Space + The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | + | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | + | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | + + ## Observation Space + Observations consist of positional values of different body parts of the walker, + followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + + By default, observations do not include the x-coordinate of the torso. It may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be `Box(-Inf, Inf, (18,), float64)` where the first observation + represent the x-coordinates of the torso of the walker. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate + of the torso will be returned in `info` with key `"x_position"`. + dimension + By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | + | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | + | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | + | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | + | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | + | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | + | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | + + ## Rewards + The reward consists of three parts: + - *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`, + - *forward_reward*: A reward of walking forward which is measured as + *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. + *dt* is the time between actions and is dependeent on the frame_skip parameter + (default is 4), where the frametime is 0.002 - making the default + *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (positive x direction). + - *ctrl_cost*: A cost for penalising the walker if it + takes actions that are too large. It is measured as + *`ctrl_cost_weight` * sum(action2)* where *`ctrl_cost_weight`* is + a parameter set for the control and has a default value of 0.001 + + The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms + + ## Starting State + All observations start in state + (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. + + ## Episode End + The walker is said to be unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite + 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` + 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` + + If `terminate_when_unhealthy=True` is passed during construction (which is the default), + the episode ends when any of the following happens: + + 1. Truncation: The episode duration reaches a 1000 timesteps + 2. Termination: The walker is unhealthy + + If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. + + ## Arguments + + No additional arguments are currently supported in v2 and lower. + + ```python + import gymnasium as gym + env = gym.make('Walker2d-v4') + ``` + + v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('Walker2d-v4', ctrl_cost_weight=0.1, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | + | `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | + | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the torso of the walker must be in this range to be considered healthy | + | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | + | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + + ## Version History + + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) """ metadata = { From f58bb5e9520d4e95b2c09a3b033dc4c3f6797d5f Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 9 May 2023 16:21:00 +0300 Subject: [PATCH 05/53] typofix --- gymnasium/envs/mujoco/half_cheetah_v4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gymnasium/envs/mujoco/half_cheetah_v4.py b/gymnasium/envs/mujoco/half_cheetah_v4.py index 13ba74669..940426f76 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v4.py +++ b/gymnasium/envs/mujoco/half_cheetah_v4.py @@ -46,7 +46,7 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): By default, observations do not include the cheetah's `rootx`. It may be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation will be a `Box(-Inf, Inf, (18,), float64)` where the first element + In that case, the observation space will be a `Box(-Inf, Inf, (18,), float64)` where the first element represents the `rootx`/ Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the will be returned in `info` with key `"x_position"`. From 7a4bc32008e1865eaa286bbdfd1c44a929195163 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 9 May 2023 16:28:31 +0300 Subject: [PATCH 06/53] typo fix --- gymnasium/envs/mujoco/half_cheetah_v4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gymnasium/envs/mujoco/half_cheetah_v4.py b/gymnasium/envs/mujoco/half_cheetah_v4.py index 940426f76..b6c59f72e 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v4.py +++ b/gymnasium/envs/mujoco/half_cheetah_v4.py @@ -47,7 +47,7 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): By default, observations do not include the cheetah's `rootx`. It may be included by passing `exclude_current_positions_from_observation=False` during construction. In that case, the observation space will be a `Box(-Inf, Inf, (18,), float64)` where the first element - represents the `rootx`/ + represents the `rootx`. Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the will be returned in `info` with key `"x_position"`. From 24186314c12362dcdd37778f2ca418705dce68de Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 9 May 2023 20:14:25 +0300 Subject: [PATCH 07/53] update following @pseudo-rnd-thoughts reviews --- gymnasium/envs/mujoco/ant_v4.py | 24 +- gymnasium/envs/mujoco/half_cheetah_v4.py | 2 +- gymnasium/envs/mujoco/hopper_v4.py | 22 +- gymnasium/envs/mujoco/humanoid_v4.py | 86 +++---- gymnasium/envs/mujoco/humanoidstandup_v4.py | 86 +++---- gymnasium/envs/mujoco/swimmer_v4.py | 2 + gymnasium/envs/mujoco/walker2d_v4.py | 258 ++++++++++---------- 7 files changed, 241 insertions(+), 239 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v4.py b/gymnasium/envs/mujoco/ant_v4.py index 9775aad59..4bec72528 100644 --- a/gymnasium/envs/mujoco/ant_v4.py +++ b/gymnasium/envs/mujoco/ant_v4.py @@ -53,8 +53,6 @@ class AntEnv(MujocoEnv, utils.EzPickle): | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | @@ -82,6 +80,8 @@ class AntEnv(MujocoEnv, utils.EzPickle): | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | | 26 | angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | If version < `v4` or `use_contact_forces` is `True` then the observation space is extended by 14*6 = 84 elements, which are contact forces @@ -90,16 +90,16 @@ class AntEnv(MujocoEnv, utils.EzPickle): | id (for `v2`, `v3`, `v4)` | body parts | | --- | ------------ | - | 0 | worldBody (note: forces are always full of zeros) | - | 1 | torso | - | 2 | front_left_leg | - | 3 | aux_1 (front left leg) | - | 4 | ankle_1 (front left leg) | - | 5 | front_right_leg | - | 6 | aux_2 (front right leg) | - | 7 | ankle_2 (front right leg) | - | 8 | back_leg (back left leg) | - | 9 | aux_3 (back left leg) | + | 0 | worldbody (note: forces are always full of zeros) | + | 1 | torso | + | 2 | front_left_leg | + | 3 | aux_1 (front left leg) | + | 4 | ankle_1 (front left leg) | + | 5 | front_right_leg | + | 6 | aux_2 (front right leg) | + | 7 | ankle_2 (front right leg) | + | 8 | back_leg (back left leg) | + | 9 | aux_3 (back left leg) | | 10 | ankle_3 (back left leg) | | 11 | right_back_leg | | 12 | aux_4 (back right leg) | diff --git a/gymnasium/envs/mujoco/half_cheetah_v4.py b/gymnasium/envs/mujoco/half_cheetah_v4.py index b6c59f72e..f90d59065 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v4.py +++ b/gymnasium/envs/mujoco/half_cheetah_v4.py @@ -55,7 +55,6 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | - | excluded | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) | | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | @@ -73,6 +72,7 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): | 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) | | 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | | 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) | ## Rewards The reward consists of two parts: diff --git a/gymnasium/envs/mujoco/hopper_v4.py b/gymnasium/envs/mujoco/hopper_v4.py index e399aa0c1..ef6cc0741 100644 --- a/gymnasium/envs/mujoco/hopper_v4.py +++ b/gymnasium/envs/mujoco/hopper_v4.py @@ -51,20 +51,20 @@ class HopperEnv(MujocoEnv, utils.EzPickle): However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ------------------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | - | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | - | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | | 5 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | | 6 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | | 7 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | ## Rewards diff --git a/gymnasium/envs/mujoco/humanoid_v4.py b/gymnasium/envs/mujoco/humanoid_v4.py index 341ac6c28..5b3d9df1d 100644 --- a/gymnasium/envs/mujoco/humanoid_v4.py +++ b/gymnasium/envs/mujoco/humanoid_v4.py @@ -54,8 +54,8 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): ## Observation Space Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. By default, observations do not include the x- and y-coordinates of the torso. These may be included by passing `exclude_current_positions_from_observation=False` during construction. @@ -68,8 +68,6 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | @@ -115,6 +113,8 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | Additionally, after all the positional and velocity based values in the table, the observation contains (in order): @@ -133,49 +133,49 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): The body parts are: | id (for `v2`,`v3`,`v4`) | body part | - | --- | ------------ | - | 0 | worldBody (note: all values are constant 0) | - | 1 | torso | - | 2 | lwaist | - | 3 | pelvis | - | 4 | right_thigh | - | 5 | right_sin | - | 6 | right_foot | - | 7 | left_thigh | - | 8 | left_sin | - | 9 | left_foot | - | 10 | right_upper_arm | - | 11 | right_lower_arm | - | 12 | left_upper_arm | - | 13 | left_lower_arm | + | --- | ------------ | + | 0 | worldBody (note: all values are constant 0) | + | 1 | torso | + | 2 | lwaist | + | 3 | pelvis | + | 4 | right_thigh | + | 5 | right_sin | + | 6 | right_foot | + | 7 | left_thigh | + | 8 | left_sin | + | 9 | left_foot | + | 10 | right_upper_arm | + | 11 | right_lower_arm | + | 12 | left_upper_arm | + | 13 | left_lower_arm | The joints are: | id (for `v2`,`v3`,`v4`) | joint | - | --- | ------------ | - | 0 | root | - | 1 | root | - | 2 | root | - | 3 | root | - | 4 | root | - | 5 | root | - | 6 | abdomen_z | - | 7 | abdomen_y | - | 8 | abdomen_x | - | 9 | right_hip_x | - | 10 | right_hip_z | - | 11 | right_hip_y | - | 12 | right_knee | - | 13 | left_hip_x | - | 14 | left_hiz_z | - | 15 | left_hip_y | - | 16 | left_knee | - | 17 | right_shoulder1 | - | 18 | right_shoulder2 | - | 19 | right_elbow| - | 20 | left_shoulder1 | - | 21 | left_shoulder2 | - | 22 | left_elfbow | + | --- | ------------ | + | 0 | root | + | 1 | root | + | 2 | root | + | 3 | root | + | 4 | root | + | 5 | root | + | 6 | abdomen_z | + | 7 | abdomen_y | + | 8 | abdomen_x | + | 9 | right_hip_x | + | 10 | right_hip_z | + | 11 | right_hip_y | + | 12 | right_knee | + | 13 | left_hip_x | + | 14 | left_hiz_z | + | 15 | left_hip_y | + | 16 | left_knee | + | 17 | right_shoulder1 | + | 18 | right_shoulder2 | + | 19 | right_elbow| + | 20 | left_shoulder1 | + | 21 | left_shoulder2 | + | 22 | left_elfbow | The (x,y,z) coordinates are translational DOFs while the orientations are rotational DOFs expressed as quaternions. One can read more about free joints on the diff --git a/gymnasium/envs/mujoco/humanoidstandup_v4.py b/gymnasium/envs/mujoco/humanoidstandup_v4.py index 797c37ecd..40e791c9b 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v4.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v4.py @@ -53,8 +53,8 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): ## Observation Space Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. By default, observations do not include the x- and y-coordinates of the torso. These may be included by passing `exclude_current_positions_from_observation=False` during construction. @@ -67,8 +67,6 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | @@ -114,6 +112,8 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | Additionally, after all the positional and velocity based values in the table, the observation contains (in order): @@ -132,49 +132,49 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): The body parts are: | id (for `v2`,`v3`,`v4`) | body part | - | --- | ------------ | - | 0 | worldBody (note: all values are constant 0) | - | 1 | torso | - | 2 | lwaist | - | 3 | pelvis | - | 4 | right_thigh | - | 5 | right_sin | - | 6 | right_foot | - | 7 | left_thigh | - | 8 | left_sin | - | 9 | left_foot | - | 10 | right_upper_arm | - | 11 | right_lower_arm | - | 12 | left_upper_arm | - | 13 | left_lower_arm | + | --- | ------------ | + | 0 | worldBody (note: all values are constant 0) | + | 1 | torso | + | 2 | lwaist | + | 3 | pelvis | + | 4 | right_thigh | + | 5 | right_sin | + | 6 | right_foot | + | 7 | left_thigh | + | 8 | left_sin | + | 9 | left_foot | + | 10 | right_upper_arm | + | 11 | right_lower_arm | + | 12 | left_upper_arm | + | 13 | left_lower_arm | The joints are: | id (for `v2`,`v3`,`v4`) | joint | - | --- | ------------ | - | 0 | root | - | 1 | root | - | 2 | root | - | 3 | root | - | 4 | root | - | 5 | root | - | 6 | abdomen_z | - | 7 | abdomen_y | - | 8 | abdomen_x | - | 9 | right_hip_x | - | 10 | right_hip_z | - | 11 | right_hip_y | - | 12 | right_knee | - | 13 | left_hip_x | - | 14 | left_hiz_z | - | 15 | left_hip_y | - | 16 | left_knee | - | 17 | right_shoulder1 | - | 18 | right_shoulder2 | - | 19 | right_elbow| - | 20 | left_shoulder1 | - | 21 | left_shoulder2 | - | 22 | left_elfbow | + | --- | ------------ | + | 0 | root | + | 1 | root | + | 2 | root | + | 3 | root | + | 4 | root | + | 5 | root | + | 6 | abdomen_z | + | 7 | abdomen_y | + | 8 | abdomen_x | + | 9 | right_hip_x | + | 10 | right_hip_z | + | 11 | right_hip_y | + | 12 | right_knee | + | 13 | left_hip_x | + | 14 | left_hiz_z | + | 15 | left_hip_y | + | 16 | left_knee | + | 17 | right_shoulder1 | + | 18 | right_shoulder2 | + | 19 | right_elbow| + | 20 | left_shoulder1 | + | 21 | left_shoulder2 | + | 22 | left_elfbow | The (x,y,z) coordinates are translational DOFs while the orientations are rotational DOFs expressed as quaternions. One can read more about free joints on the diff --git a/gymnasium/envs/mujoco/swimmer_v4.py b/gymnasium/envs/mujoco/swimmer_v4.py index a936ca71a..6db5f4f08 100644 --- a/gymnasium/envs/mujoco/swimmer_v4.py +++ b/gymnasium/envs/mujoco/swimmer_v4.py @@ -67,6 +67,8 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): | 5 | angular velocity of front tip | -Inf | Inf | free_body_rot | hinge | angular velocity (rad/s) | | 6 | angular velocity of first rotor | -Inf | Inf | motor1_rot | hinge | angular velocity (rad/s) | | 7 | angular velocity of second rotor | -Inf | Inf | motor2_rot | hinge | angular velocity (rad/s) | + | excluded | position of the tip along the x-axis | -Inf | Inf | slider1 | slide | position (m) | + | excluded | position of the tip along the y-axis | -Inf | Inf | slider2 | slide | position (m) | ## Rewards The reward consists of two parts: diff --git a/gymnasium/envs/mujoco/walker2d_v4.py b/gymnasium/envs/mujoco/walker2d_v4.py index fd2b14f28..204f17644 100644 --- a/gymnasium/envs/mujoco/walker2d_v4.py +++ b/gymnasium/envs/mujoco/walker2d_v4.py @@ -15,135 +15,135 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): """ - ## Description - - This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment - by adding another set of legs making it possible for the robot to walk forward instead of - hop. Like other Mujoco environments, this environment aims to increase the number of independent state - and control variables as compared to the classic control environments. The walker is a - two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top - (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs - in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. - The goal is to walk in the in the forward (right) - direction by applying torques on the six hinges connecting the seven body parts. - - ## Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | - | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | - | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the walker, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the torso. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be `Box(-Inf, Inf, (18,), float64)` where the first observation - represent the x-coordinates of the torso of the walker. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - of the torso will be returned in `info` with key `"x_position"`. - dimension - By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | - | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | - | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | - | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | - | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | - | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | - | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`, - - *forward_reward*: A reward of walking forward which is measured as - *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. - *dt* is the time between actions and is dependeent on the frame_skip parameter - (default is 4), where the frametime is 0.002 - making the default - *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (positive x direction). - - *ctrl_cost*: A cost for penalising the walker if it - takes actions that are too large. It is measured as - *`ctrl_cost_weight` * sum(action2)* where *`ctrl_cost_weight`* is - a parameter set for the control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ## Episode End - The walker is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` - 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The walker is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4') - ``` - - v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | - | `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | - | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the torso of the walker must be in this range to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) + ## Description + + This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment + by adding another set of legs making it possible for the robot to walk forward instead of + hop. Like other Mujoco environments, this environment aims to increase the number of independent state + and control variables as compared to the classic control environments. The walker is a + two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top + (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs + in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. + The goal is to walk in the in the forward (right) + direction by applying torques on the six hinges connecting the seven body parts. + + ## Action Space + The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | + | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | + | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | + + ## Observation Space + Observations consist of positional values of different body parts of the walker, + followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + + By default, observations do not include the x-coordinate of the torso. It may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be `Box(-Inf, Inf, (18,), float64)` where the first observation + represent the x-coordinates of the torso of the walker. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate + of the torso will be returned in `info` with key `"x_position"`. + + By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | + | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | + | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | + | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | + | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | + | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | + | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | + + ## Rewards + The reward consists of three parts: + - *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`, + - *forward_reward*: A reward of walking forward which is measured as + *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. + *dt* is the time between actions and is dependeent on the frame_skip parameter + (default is 4), where the frametime is 0.002 - making the default + *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (positive x direction). + - *ctrl_cost*: A cost for penalising the walker if it + takes actions that are too large. It is measured as + *`ctrl_cost_weight` * sum(action2)* where *`ctrl_cost_weight`* is + a parameter set for the control and has a default value of 0.001 + + The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms + + ## Starting State + All observations start in state + (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. + + ## Episode End + The walker is said to be unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite + 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` + 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` + + If `terminate_when_unhealthy=True` is passed during construction (which is the default), + the episode ends when any of the following happens: + + 1. Truncation: The episode duration reaches a 1000 timesteps + 2. Termination: The walker is unhealthy + + If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. + + ## Arguments + + No additional arguments are currently supported in v2 and lower. + + ```python + import gymnasium as gym + env = gym.make('Walker2d-v4') + ``` + + v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('Walker2d-v4', ctrl_cost_weight=0.1, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | + | `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | + | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the torso of the walker must be in this range to be considered healthy | + | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | + | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + + ## Version History + + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) """ metadata = { From 7639d18ced28a06d641135cfc91df4b5fa6487d3 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas <30759571+Kallinteris-Andreas@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:03:06 +0000 Subject: [PATCH 08/53] refactor `tests/env/test_mojoco.py` -> `tests/env/mujoco/test_mojoco_v3.py` --- tests/envs/{test_mujoco.py => mujoco/test_mujoco_v3.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/envs/{test_mujoco.py => mujoco/test_mujoco_v3.py} (100%) diff --git a/tests/envs/test_mujoco.py b/tests/envs/mujoco/test_mujoco_v3.py similarity index 100% rename from tests/envs/test_mujoco.py rename to tests/envs/mujoco/test_mujoco_v3.py From ab931b7796b0e5e8a18d657bbd7ca7caadff7a20 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 27 Jun 2023 21:57:56 +0300 Subject: [PATCH 09/53] add files --- gymnasium/envs/__init__.py | 80 ++- gymnasium/envs/mujoco/ant_v5.py | 427 ++++++++++++++ gymnasium/envs/mujoco/assets/hopper_v5.xml | 53 ++ gymnasium/envs/mujoco/assets/walker2d_v5.xml | 68 +++ .../envs/mujoco/assets/walker2d_v5_old.xml | 68 +++ gymnasium/envs/mujoco/half_cheetah_v5.py | 268 +++++++++ gymnasium/envs/mujoco/hopper_v5.py | 334 +++++++++++ gymnasium/envs/mujoco/humanoid_v5.py | 524 ++++++++++++++++++ gymnasium/envs/mujoco/humanoidstandup_v5.py | 469 ++++++++++++++++ .../mujoco/inverted_double_pendulum_v5.py | 223 ++++++++ gymnasium/envs/mujoco/inverted_pendulum_v5.py | 178 ++++++ gymnasium/envs/mujoco/pusher_v5.py | 245 ++++++++ gymnasium/envs/mujoco/reacher_v5.py | 219 ++++++++ gymnasium/envs/mujoco/swimmer_v5.py | 259 +++++++++ gymnasium/envs/mujoco/walker2d_v5.py | 328 +++++++++++ tests/envs/mujoco/test_mujoco_v5.py | 471 ++++++++++++++++ 16 files changed, 4213 insertions(+), 1 deletion(-) create mode 100644 gymnasium/envs/mujoco/ant_v5.py create mode 100644 gymnasium/envs/mujoco/assets/hopper_v5.xml create mode 100644 gymnasium/envs/mujoco/assets/walker2d_v5.xml create mode 100644 gymnasium/envs/mujoco/assets/walker2d_v5_old.xml create mode 100644 gymnasium/envs/mujoco/half_cheetah_v5.py create mode 100644 gymnasium/envs/mujoco/hopper_v5.py create mode 100644 gymnasium/envs/mujoco/humanoid_v5.py create mode 100644 gymnasium/envs/mujoco/humanoidstandup_v5.py create mode 100644 gymnasium/envs/mujoco/inverted_double_pendulum_v5.py create mode 100644 gymnasium/envs/mujoco/inverted_pendulum_v5.py create mode 100644 gymnasium/envs/mujoco/pusher_v5.py create mode 100644 gymnasium/envs/mujoco/reacher_v5.py create mode 100644 gymnasium/envs/mujoco/swimmer_v5.py create mode 100644 gymnasium/envs/mujoco/walker2d_v5.py create mode 100644 tests/envs/mujoco/test_mujoco_v5.py diff --git a/gymnasium/envs/__init__.py b/gymnasium/envs/__init__.py index c41b76b9c..e28a0cb35 100644 --- a/gymnasium/envs/__init__.py +++ b/gymnasium/envs/__init__.py @@ -180,7 +180,7 @@ # Mujoco # ---------------------------------------- -# 2D +# manipulation register( id="Reacher-v2", @@ -196,6 +196,13 @@ reward_threshold=-3.75, ) +register( + id="Reacher-v5", + entry_point="gymnasium.envs.mujoco.reacher_v5:ReacherEnv", + max_episode_steps=50, + reward_threshold=-3.75, +) + register( id="Pusher-v2", entry_point="gymnasium.envs.mujoco:PusherEnv", @@ -210,6 +217,15 @@ reward_threshold=0.0, ) +register( + id="Pusher-v5", + entry_point="gymnasium.envs.mujoco.pusher_v5:PusherEnv", + max_episode_steps=100, + reward_threshold=0.0, +) + +# balance + register( id="InvertedPendulum-v2", entry_point="gymnasium.envs.mujoco:InvertedPendulumEnv", @@ -224,6 +240,13 @@ reward_threshold=950.0, ) +register( + id="InvertedPendulum-v5", + entry_point="gymnasium.envs.mujoco.inverted_pendulum_v5:InvertedPendulumEnv", + max_episode_steps=1000, + reward_threshold=950.0, +) + register( id="InvertedDoublePendulum-v2", entry_point="gymnasium.envs.mujoco:InvertedDoublePendulumEnv", @@ -238,6 +261,15 @@ reward_threshold=9100.0, ) +register( + id="InvertedDoublePendulum-v5", + entry_point="gymnasium.envs.mujoco.inverted_double_pendulum_v5:InvertedDoublePendulumEnv", + max_episode_steps=1000, + reward_threshold=9100.0, +) + +# runners + register( id="HalfCheetah-v2", entry_point="gymnasium.envs.mujoco:HalfCheetahEnv", @@ -259,6 +291,13 @@ reward_threshold=4800.0, ) +register( + id="HalfCheetah-v5", + entry_point="gymnasium.envs.mujoco.half_cheetah_v5:HalfCheetahEnv", + max_episode_steps=1000, + reward_threshold=4800.0, +) + register( id="Hopper-v2", entry_point="gymnasium.envs.mujoco:HopperEnv", @@ -280,6 +319,13 @@ reward_threshold=3800.0, ) +register( + id="Hopper-v5", + entry_point="gymnasium.envs.mujoco.hopper_v5:HopperEnv", + max_episode_steps=1000, + reward_threshold=3800.0, +) + register( id="Swimmer-v2", entry_point="gymnasium.envs.mujoco:SwimmerEnv", @@ -301,6 +347,13 @@ reward_threshold=360.0, ) +register( + id="Swimmer-v5", + entry_point="gymnasium.envs.mujoco.swimmer_v5:SwimmerEnv", + max_episode_steps=1000, + reward_threshold=360.0, +) + register( id="Walker2d-v2", max_episode_steps=1000, @@ -319,6 +372,12 @@ entry_point="gymnasium.envs.mujoco.walker2d_v4:Walker2dEnv", ) +register( + id="Walker2d-v5", + max_episode_steps=1000, + entry_point="gymnasium.envs.mujoco.walker2d_v5:Walker2dEnv", +) + register( id="Ant-v2", entry_point="gymnasium.envs.mujoco:AntEnv", @@ -340,6 +399,13 @@ reward_threshold=6000.0, ) +register( + id="Ant-v5", + entry_point="gymnasium.envs.mujoco.ant_v5:AntEnv", + max_episode_steps=1000, + reward_threshold=6000.0, +) + register( id="Humanoid-v2", entry_point="gymnasium.envs.mujoco:HumanoidEnv", @@ -358,6 +424,12 @@ max_episode_steps=1000, ) +register( + id="Humanoid-v5", + entry_point="gymnasium.envs.mujoco.humanoid_v5:HumanoidEnv", + max_episode_steps=1000, +) + register( id="HumanoidStandup-v2", entry_point="gymnasium.envs.mujoco:HumanoidStandupEnv", @@ -370,6 +442,12 @@ max_episode_steps=1000, ) +register( + id="HumanoidStandup-v5", + entry_point="gymnasium.envs.mujoco.humanoidstandup_v5:HumanoidStandupEnv", + max_episode_steps=1000, +) + # --- For shimmy compatibility def _raise_shimmy_error(*args: Any, **kwargs: Any): diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py new file mode 100644 index 000000000..dcf2d3887 --- /dev/null +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -0,0 +1,427 @@ +__credits__ = ["Kallinteris-Andreas"] + +from typing import Dict, Tuple, Union + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "distance": 4.0, +} + + +class AntEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the environment introduced by Schulman, + Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control + Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438). + The ant is a 3D robot consisting of one torso (free rotational body) with + four legs attached to it with each leg having two body parts. The goal is to + coordinate the four legs to move in the forward (right) direction by applying + torques on the eight hinges connecting the two body parts of each leg and the torso + (nine body parts and eight hinges). + + + ## Action Space + The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | + | 1 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | + | 2 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | + | 3 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | + | 7 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | + + + ## Observation Space + Observations consist of positional values of different body parts of the ant, + followed by the velocities of those individual parts (their derivatives) with all + the positions ordered before all the velocities. + + By default, observations do not include the x- and y-coordinates of the ant's torso. These may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be a `Box(-Inf, Inf, (107,), float64)` where the first two observations + represent the x- and y- coordinates of the ant's torso. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates + of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + However, by default, observation Space is a `Box(-Inf, Inf, (105,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | + | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | + | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | + | 21 | angular velocity of angle between torso and front right link | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | + | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | + | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | + | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | + | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | + | 26 | angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + + + Additionally, after all the positional and velocity based values in the table, the observation contains: + - *cfrc_ext:* 13*6 = 78 elements, which are contact forces + (external forces - force x, y, z and torque x, y, z) applied to the + center of mass of each of the body parts. + + The body parts are: + + | id (for `v2`, `v3`, `v4)` | id (for `v5`) | body part | + | ---| --- | ------------ | + | 0 |excluded| worldbody (note: all values are constant 0) | + | 1 |0 | torso | + | 2 |1 | front_left_leg | + | 3 |2 | aux_1 (front left leg) | + | 4 |3 | ankle_1 (front left leg) | + | 5 |4 | front_right_leg | + | 6 |5 | aux_2 (front right leg) | + | 7 |6 | ankle_2 (front right leg) | + | 8 |7 | back_leg (back left leg) | + | 9 |8 | aux_3 (back left leg) | + | 10 |9 | ankle_3 (back left leg) | + | 11 |10 | right_back_leg | + | 12 |11 | aux_4 (back right leg) | + | 13 |12 | ankle_4 (back right leg) | + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + + **Note:** Ant-v4+ environment no longer has the following contact forces issue. + If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results + in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 + when using the Ant environment if you would like to report results with contact forces (if + contact forces are not used in your experiments, you can use version > 2.0). + + + ## Rewards + The reward consists of three parts: + - *healthy_reward*: + Every timestep that the ant is healthy (see definition in section "Episode Termination"), + it gets a reward of fixed value `healthy_reward`. + - *forward_reward*: + A reward of moving forward, + this reward would be positive if the Ant moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the `main_body` ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 5), + and `frametime` which is 0.01 - making the default $dt = 5 \times 0.01 = 0.05$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward for penalizing the Ant if it takes actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $0.5$). + - *contact_cost*: + A negative reward for penalizing the Ant if the external contact forces are too large. + $w_{contact} \times \\|F_{contact}\\|_2^2$, where + $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-4}$), + $F_{contact}$ are the external contact forces clipped by `contact_force_range` (see `cfrc_ext` section on observation). + + + The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. + + But if `use_contact_forces=false` on `v4` + The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. + + In either case `info` will also contain the individual reward terms. + + + ## Starting State + All observations start in state + (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range + of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise + with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for + stochasticity. Note that the initial z coordinate is intentionally selected + to be slightly high, thereby indicating a standing up ant. The initial orientation + is designed to make it face forward as well. + + + ## Episode End + #### Termination + If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Ant is unhealthy. + the Ant is unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite + 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0]) + + #### truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('Ant-v5', ctrl_cost_weight=0.5, ...) + ``` + + | Parameter | Type | Default |Description | + |--------------------------------------------|------------|--------------|-------------------------------| + |`xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model | + |`forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward)| + |`ctrl_cost_weight` | **float** | `0.5` | Weight for _ctrl_cost_ term (see section on reward) | + |`contact_cost_weight` | **float** | `5e-4` | Weight for _contact_cost_ term (see section on reward) | + |`healthy_reward` | **float** | `1` | Weight for _healthy_reward_ term (see section on reward) | + |`main_body` |**str|int** | `1`("torso") | Name or ID of the body, whose diplacement is used to calculate the *dx*/_forward_reward_ (useful for custom MuJoCo models)| + |`terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | + |`healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range | + |`contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* | + |`reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + |`exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + |`include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | + |`use_contact_forces` (`v4` only) | **bool** | `False` | If true, it extends the observation space by adding contact forces (see `Observation Space` section) and includes contact_cost to the reward function (see `Rewards` section) | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3, also removed contact forces from the default observation space (new variable `use_contact_forces=True` can restore them) + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file: str = "ant.xml", + frame_skip: int = 5, + default_camera_config: Dict = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1, + ctrl_cost_weight: float = 0.5, + contact_cost_weight: float = 5e-4, + healthy_reward: float = 1.0, + main_body: Union[int, str] = 1, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (0.2, 1.0), + contact_force_range: Tuple[float, float] = (-1.0, 1.0), + reset_noise_scale: float = 0.1, + exclude_current_positions_from_observation: bool = True, + include_cfrc_ext_in_observation: bool = True, + **kwargs + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + contact_cost_weight, + healthy_reward, + main_body, + terminate_when_unhealthy, + healthy_z_range, + contact_force_range, + reset_noise_scale, + exclude_current_positions_from_observation, + include_cfrc_ext_in_observation, + **kwargs + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._contact_cost_weight = contact_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._contact_force_range = contact_force_range + + self._main_body = main_body + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + self._include_cfrc_ext_in_observation = include_cfrc_ext_in_observation + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, # needs to be defined after + default_camera_config=default_camera_config, + **kwargs + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = self.data.qpos.size + self.data.qvel.size + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation + + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + "cfrc_ext": self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def contact_forces(self): + raw_contact_forces = self.data.cfrc_ext + min_value, max_value = self._contact_force_range + contact_forces = np.clip(raw_contact_forces, min_value, max_value) + return contact_forces + + @property + def contact_cost(self): + contact_cost = self._contact_cost_weight * np.sum( + np.square(self.contact_forces) + ) + return contact_cost + + @property + def is_healthy(self): + state = self.state_vector() + min_z, max_z = self._healthy_z_range + is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + # TODO remove after validation + assert terminated == ( + not self.is_healthy if self._terminate_when_unhealthy else False + ) + return terminated + + def step(self, action): + xy_position_before = self.data.body(self._main_body).xpos[:2].copy() + # TODO remove after validation + assert (xy_position_before == self.get_body_com("torso")[:2].copy()).all() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.data.body(self._main_body).xpos[:2].copy() + # TODO remove after validation + assert (xy_position_after == self.get_body_com("torso")[:2].copy()).all() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + forward_reward = x_velocity * self._forward_reward_weight + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + + costs = ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + costs += contact_cost + + terminated = self.terminated + observation = self._get_obs() + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_contact": -contact_cost, + "reward_survive": healthy_reward, + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + } + + reward = rewards - costs + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + if self._include_cfrc_ext_in_observation: + assert ( + self.contact_forces[0].flat.copy() == 0 + ).all() # TODO remove after validation + contact_force = self.contact_forces[1:].flat.copy() + assert len(contact_force) == 78 + return np.concatenate((position, velocity, contact_force)) + else: + return np.concatenate((position, velocity)) + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = ( + self.init_qvel + + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv) + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + } diff --git a/gymnasium/envs/mujoco/assets/hopper_v5.xml b/gymnasium/envs/mujoco/assets/hopper_v5.xml new file mode 100644 index 000000000..cc3fbc849 --- /dev/null +++ b/gymnasium/envs/mujoco/assets/hopper_v5.xml @@ -0,0 +1,53 @@ + + + + + + + + + diff --git a/gymnasium/envs/mujoco/assets/walker2d_v5.xml b/gymnasium/envs/mujoco/assets/walker2d_v5.xml new file mode 100644 index 000000000..12baef14b --- /dev/null +++ b/gymnasium/envs/mujoco/assets/walker2d_v5.xml @@ -0,0 +1,68 @@ + + + + + + + + diff --git a/gymnasium/envs/mujoco/assets/walker2d_v5_old.xml b/gymnasium/envs/mujoco/assets/walker2d_v5_old.xml new file mode 100644 index 000000000..180bfaec2 --- /dev/null +++ b/gymnasium/envs/mujoco/assets/walker2d_v5_old.xml @@ -0,0 +1,68 @@ + + + + + + + + diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py new file mode 100644 index 000000000..d0ead9924 --- /dev/null +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -0,0 +1,268 @@ +__credits__ = ["Kallinteris-Andreas", "Rushiv Arora"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "distance": 4.0, +} + + +class HalfCheetahEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the work by P. Wawrzyński in + ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf). + The HalfCheetah is a 2-dimensional robot consisting of 9 body parts and 8 + joints connecting them (including two paws). The goal is to apply a torque + on the joints to make the cheetah run forward (right) as fast as possible, + with a positive reward allocated based on the distance moved forward and a + negative reward allocated for moving backward. The torso and head of the + cheetah are fixed, and the torque can only be applied on the other 6 joints + over the front and back thighs (connecting to the torso), shins + (connecting to the thighs) and feet (connecting to the shins). + + + ## Action Space + The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | --- | --------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) | + | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) | + | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) | + | 3 | Torque applied on the front thigh rotor | -1 | 1 | fthigh | hinge | torque (N m) | + | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) | + | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) | + + + ## Observation Space + Observations consist of positional values of different body parts of the + cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + + By default, observations do not include the cheetah's `rootx`. It may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be a `Box(-Inf, Inf, (18,), float64)` where the first element + represents the `rootx`. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the + will be returned in `info` with key `"x_position"`. + + However, by default, the observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | + | 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) | + | 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) | + | 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) | + | 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) | + | 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) | + | 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) | + | 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) | + | 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) | + | 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) | + | 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | + | 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) | + + + ## Rewards + The reward consists of two parts: + - *forward_reward*: + A reward of moving forward, + this reward would be positive if the Half Cheetah moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the "tip" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 5), + and `frametime` which is 0.01 - making the default $dt = 5 \times 0.01 = 0.05$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward for penalizing the Half Cheetah if it takes actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $0.1$). + + The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*, + and `info` will also contain the individual reward terms. + + + ## Starting State + All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the + initial state for stochasticity. As seen before, the first 8 values in the + state are positional and the last 9 values are velocity. A uniform noise in + the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard + normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the + initial velocity values of all zeros. + + + ## Episode End + #### Termination + The Half Cheetah never terminates. + + #### Truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('HalfCheetah-v5', ctrl_cost_weight=0.1, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ weight (see section on reward) | + | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen). + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="half_cheetah.xml", + frame_skip=5, + default_camera_config=DEFAULT_CAMERA_CONFIG, + forward_reward_weight=1.0, + ctrl_cost_weight=0.1, + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 1 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 1 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + info = { + "x_position": x_position_after, + "x_velocity": x_velocity, + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + } + + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = ( + self.init_qvel + + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv) + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + } diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py new file mode 100644 index 000000000..1d808a324 --- /dev/null +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -0,0 +1,334 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 2, + "distance": 3.0, + "lookat": np.array((0.0, 0.0, 1.15)), + "elevation": -20.0, +} + + +class HopperEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the work done by Erez, Tassa, and Todorov in + ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to + increase the number of independent state and control variables as compared to + the classic control environments. The hopper is a two-dimensional + one-legged figure that consist of four main body parts - the torso at the + top, the thigh in the middle, the leg in the bottom, and a single foot on + which the entire body rests. The goal is to make hops that move in the + forward (right) direction by applying torques on the three hinges + connecting the four body parts. + + + ## Action Space + The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + + + ## Observation Space + Observations consist of positional values of different body parts of the + hopper, followed by the velocities of those individual parts + (their derivatives) with all the positions ordered before all the velocities. + + By default, observations do not include the x-coordinate of the hopper. It may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be `Box(-Inf, Inf, (12,), float64)` where the first observation + represents the x-coordinate of the hopper. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate + will be returned in `info` with key `"x_position"`. + + However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements + correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | + | 6 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | + | 7 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | + + + ## Rewards + The reward consists of three parts: + - *healthy_reward*: + Every timestep that the Hopper is healthy (see definition in section "Episode Termination"), + it gets a reward of fixed value `healthy_reward`. + - *forward_reward*: + A reward of moving forward, + this reward would be positive if the Hopper moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the "torso" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 4), + and `frametime` which is 0.002 - making the default $dt = 4 \times 0.002 = 0.008$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward for penalizing the Hopper if it takes actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). + + The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*, + and `info` will also contain the individual reward terms. + + ## Starting State + All observations start in state + (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise + in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. + + + ## Episode End + #### Termination + If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Hopper is unhealthy. + The Hopper is unhealthy if any of the following happens: + + 1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range` + 2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen) + 3. The angle of the torso (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range` + + #### Truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('Hopper-v5', ctrl_cost_weight=1e-3, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"hopper_v5.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctrl_cost_ reward (see section on reward) | + | `healthy_reward` | **float** | `1` | Weight for _healthy_reward_ reward (see section on reward) | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the hopper is no longer healthy | + | `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy | + | `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy | + | `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy | + | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0). + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="hopper_v5.xml", + frame_skip=4, + default_camera_config=DEFAULT_CAMERA_CONFIG, + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.7, float("inf")), + healthy_angle_range=(-0.2, 0.2), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + healthy_reward, + terminate_when_unhealthy, + healthy_state_range, + healthy_z_range, + healthy_angle_range, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_state_range = healthy_state_range + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 1 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 1 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.data.qpos[1:3] + state = self.state_vector()[2:] + + min_state, max_state = self._healthy_state_range + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_state = np.all(np.logical_and(min_state < state, state < max_state)) + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + + is_healthy = all((healthy_state, healthy_z, healthy_angle)) + + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + # TODO remove after validation + assert terminated == ( + not self.is_healthy if self._terminate_when_unhealthy else False + ) + return terminated + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = np.clip(self.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_survive": healthy_reward, + "x_position": x_position_after, + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + "x_velocity": x_velocity, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def viewer_setup(self): + assert self.viewer is not None + for key, value in DEFAULT_CAMERA_CONFIG.items(): + if isinstance(value, np.ndarray): + getattr(self.viewer.cam, key)[:] = value + else: + setattr(self.viewer.cam, key, value) + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + } diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py new file mode 100644 index 000000000..b45afc80b --- /dev/null +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -0,0 +1,524 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 1, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 2.0)), + "elevation": -20.0, +} + + +def mass_center(model, data): + mass = np.expand_dims(model.body_mass, axis=1) + xpos = data.xipos + return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() + + +class HumanoidEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the environment introduced by Tassa, Erez and Todorov + in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). + The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of + legs and arms. The legs each consist of three body parts, and the arms 2 body parts (representing the knees and + elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over. + + + ## Action Space + The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| + | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | + | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | + | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | + | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | + | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | + | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | + | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + + + ## Observation Space + Observations consist of positional values of different body parts of the Humanoid, + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. + + By default, observations do not include the x- and y-coordinates of the torso. These may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be a `Box(-Inf, Inf, (350,), float64)` where the first two observations + represent the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates + will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + However, by default, the observation is a `Box(-Inf, Inf, (348,), float64)`. The elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | + | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | + | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | + | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | + | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | + | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | + | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | + | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | + | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | + | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | + | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | + | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | + | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | + | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | + | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | + | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | + | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | + | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | + | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | + | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | + | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | + | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | + | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | + | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | + | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | + | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | + | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | + | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | + | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | + | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | + | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | + | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | + | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | + | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + + Additionally, after all the positional and velocity based values in the table, + the observation contains (in order): + - *cinert:* Mass and inertia of a single rigid body relative to the center of mass + (this is an intermediate result of transition). It has shape 13*10 (*nbody * 10*) + and hence adds to another 130 elements in the state space. + - *cvel:* Center of mass based velocity. It has shape 13 * 6 (*nbody * 6*) and hence + adds another 78 elements in the state space + - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape + `(17,)` *(nv * 1)* and hence adds another 17 elements to the state space. + - *cfrc_ext:* This is the center of mass based external force on the body. It has shape + 13 * 6 (*nbody * 6*) and hence adds to another 78 elements in the state space. + where *nbody* stands for the number of bodies in the robot and *nv* stands for the + number of degrees of freedom (*= dim(qvel)*) + + The body parts are: + + | id (for `v2`, `v3`, `v4)` | id (for `v5`) | body part | + | ---| --- | ------------ | + | 0 |excluded| worldbody (note: all values are constant 0) | + | 1 | 0 | torso | + | 2 | 1 | lwaist | + | 3 | 2 | pelvis | + | 4 | 3 | right_thigh | + | 5 | 4 | right_sin | + | 6 | 5 | right_foot | + | 7 | 6 | left_thigh | + | 8 | 7 | left_sin | + | 9 | 8 | left_foot | + | 10 | 9 | right_upper_arm | + | 11 | 10 | right_lower_arm | + | 12 | 11 | left_upper_arm | + | 13 | 12 | left_lower_arm | + + The joints are: + + | id (for `v2`, `v3`, `v4)` | id (for `v5`) | joint | + | ---| --- | ------------ | + | 0 |excluded| root (note: all values are constant 0) | + | 1 |excluded| root (note: all values are constant 0) | + | 2 |excluded| root (note: all values are constant 0) | + | 3 |excluded| root (note: all values are constant 0) | + | 4 |excluded| root (note: all values are constant 0) | + | 5 |excluded| root (note: all values are constant 0) | + | 6 | 0 | abdomen_z | + | 7 | 1 | abdomen_y | + | 8 | 2 | abdomen_x | + | 9 | 3 | right_hip_x | + | 10 | 4 | right_hip_z | + | 11 | 5 | right_hip_y | + | 12 | 6 | right_knee | + | 13 | 7 | left_hip_x | + | 14 | 8 | left_hiz_z | + | 15 | 9 | left_hip_y | + | 16 | 10 | left_knee | + | 17 | 11 | right_shoulder1 | + | 18 | 12 | right_shoulder2 | + | 19 | 13 | right_elbow| + | 20 | 14 | left_shoulder1 | + | 21 | 15 | left_shoulder2 | + | 22 | 16 | left_elfbow | + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the + [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + **Note:** Humanoid-v4 environment no longer has the following contact forces issue. + If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 + results in the contact forces always being 0. As such we recommend to use a Mujoco-Py + version < 2.0 when using the Humanoid environment if you would like to report results + with contact forces (if contact forces are not used in your experiments, you can use + version > 2.0). + + + ## Rewards + The reward consists of three parts: + - *healthy_reward*: + Every timestep that the Humanoid is alive (see section Episode Termination for definition), + it gets a reward of fixed value `healthy_reward`. + - *forward_reward*: + A reward of moving forward, + this reward would be positive if the Humanoid moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the center of mass ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 5), + and `frametime` which is 0.001 - making the default $dt = 5 \times 0.003 = 0.015$, + $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). + The calculation for the center of mass is defined in the `.py` file for the Humanoid. + - *ctrl_cost*: + A negative reward for penalizing the Humanoid if it takes actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $0.1$). + If there are *nu* actuators/controls, then the control has shape `nu x 1`. + - *contact_cost*: + A negative reward for penalizing the Humanoid if the external contact forces are too large. + $w_{contact} \times clamp(contact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where + $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-7}$), + $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). + + The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* + and `info` will also contain the individual reward terms. + + Note: in `v4` the total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* + + + ## Starting State + All observations start in state + (0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range + of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table) + for stochasticity. Note that the initial z coordinate is intentionally + selected to be high, thereby indicating a standing up humanoid. The initial + orientation is designed to make it face forward as well. + + + ## Episode End + #### Termination + If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Humanoid is unhealthy. + The Humanoid is said to be unhealthy if any of the following happens: + + 1. The z-position of the torso (the height) is no longer contained in `healthy_z_range`. + + #### Truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('Humanoid-v5', ctrl_cost_weight=0.1, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1.25` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ term (see section on reward) | + | `contact_cost_weight` | **float** | `5e-7` | Weight for _contact_cost_ term (see section on reward) | + | `contact_cost_range` | **float** | `(-np.inf, 10.0) | Clamps the _contact_cost_ term (see section on reward) | + | `healthy_reward` | **float** | `5.0` | Weight for _healthy_reward_ term (see section on reward) | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | + | `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range | + | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + | `include_cinert_in_observation` | **bool** | `True` | Whether to include *cinert* elements in the observations. | + | `include_cvel_in_observation` | **bool** | `True` | Whether to include *cvel* elements in the observations. | + | `include_qfrc_actuator_in_observation` | **bool** | `True` | Whether to include *qfrc_actuator* elements in the observations. | + | `include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="humanoid.xml", + frame_skip=5, + default_camera_config=DEFAULT_CAMERA_CONFIG, + forward_reward_weight=1.25, + ctrl_cost_weight=0.1, + contact_cost_weight=5e-7, + contact_cost_range=(-np.inf, 10.0), + healthy_reward=5.0, + terminate_when_unhealthy=True, + healthy_z_range=(1.0, 2.0), + reset_noise_scale=1e-2, + exclude_current_positions_from_observation=True, + include_cinert_in_observation=True, + include_cvel_in_observation=True, + include_qfrc_actuator_in_observation=True, + include_cfrc_ext_in_observation=True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + contact_cost_weight, + contact_cost_range, + healthy_reward, + terminate_when_unhealthy, + healthy_z_range, + reset_noise_scale, + exclude_current_positions_from_observation, + include_cinert_in_observation, + include_cvel_in_observation, + include_qfrc_actuator_in_observation, + include_cfrc_ext_in_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._contact_cost_weight = contact_cost_weight + self._contact_cost_range = contact_cost_range + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + self._include_cinert_in_observation = include_cinert_in_observation + self._include_cvel_in_observation = include_cvel_in_observation + self._include_qfrc_actuator_in_observation = ( + include_qfrc_actuator_in_observation + ) + self._include_cfrc_ext_in_observation = include_cfrc_ext_in_observation + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = self.data.qpos.size + self.data.qvel.size + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += self.data.cinert[1:].size * include_cinert_in_observation + obs_size += self.data.cvel[1:].size * include_cvel_in_observation + obs_size += (self.data.qvel.size - 6) * include_qfrc_actuator_in_observation + obs_size += self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation + + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + "cinert": self.data.cinert[1:].size * include_cinert_in_observation, + "cvel": self.data.cvel[1:].size * include_cvel_in_observation, + "qfrc_actuator": (self.data.qvel.size - 6) + * include_qfrc_actuator_in_observation, + "cfrc_ext": self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation, + "ten_lenght": 0, + "ten_velocity": 0, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(self.data.ctrl)) + return control_cost + + @property + def contact_cost(self): + contact_forces = self.data.cfrc_ext + contact_cost = self._contact_cost_weight * np.sum(np.square(contact_forces)) + min_cost, max_cost = self._contact_cost_range + contact_cost = np.clip(contact_cost, min_cost, max_cost) + return contact_cost + + @property + def is_healthy(self): + min_z, max_z = self._healthy_z_range + is_healthy = min_z < self.data.qpos[2] < max_z + + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + # TODO remove after validation + assert terminated == ( + not self.is_healthy if self._terminate_when_unhealthy else False + ) + return terminated + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._include_cinert_in_observation is True: + com_inertia = self.data.cinert[1:].flat.copy() + else: + com_inertia = np.array([]) + if self._include_cvel_in_observation is True: + com_velocity = self.data.cvel[1:].flat.copy() + else: + com_velocity = np.array([]) + + if self._include_qfrc_actuator_in_observation is True: + actuator_forces = self.data.qfrc_actuator[6:].flat.copy() + else: + actuator_forces = np.array([]) + if self._include_cfrc_ext_in_observation is True: + external_contact_forces = self.data.cfrc_ext[1:].flat.copy() + else: + external_contact_forces = np.array([]) + + # TODO remove after validation + assert (self.data.cinert[0].flat.copy() == 0).all() + assert (self.data.cvel[0].flat.copy() == 0).all() + assert (self.data.qfrc_actuator[:6].flat.copy() == 0).all() + assert (self.data.cfrc_ext[0].flat.copy() == 0).all() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + return np.concatenate( + ( + position, + velocity, + com_inertia, + com_velocity, + actuator_forces, + external_contact_forces, + ) + ) + + def step(self, action): + xy_position_before = mass_center(self.model, self.data) + self.do_simulation(action, self.frame_skip) + xy_position_after = mass_center(self.model, self.data) + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + costs = ctrl_cost + contact_cost + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + + observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated + info = { + "reward_survive": healthy_reward, + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_contact": -contact_cost, + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + } diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py new file mode 100644 index 000000000..ba13b9f13 --- /dev/null +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -0,0 +1,469 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 1, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 0.8925)), + "elevation": -20.0, +} + + +class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment is based on the environment introduced by Tassa, Erez and Todorov + in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). + The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a + pair of legs and arms. The legs each consist of two links, and so the arms (representing the + knees and elbows respectively). The environment starts with the humanoid laying on the ground, + and then the goal of the environment is to make the humanoid standup and then keep it standing + by applying torques on the various hinges. + + + ## Action Space + The agent take a 17-element vector for actions. + + The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action` + represents the numerical torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | + | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | + | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | + | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | + | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | + | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | + | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + + + ## Observation Space + Observations consist of positional values of different body parts of the Humanoid, + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. + + By default, observations do not include the x- and y-coordinates of the torso. These may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be a `Box(-Inf, Inf, (350,), float64)` where the first two observations + represent the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates + will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + However, by default, the observation is a `Box(-Inf, Inf, (348,), float64)`. The elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | + | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | + | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | + | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | + | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | + | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | + | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | + | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | + | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | + | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | + | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | + | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | + | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | + | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | + | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | + | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | + | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | + | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | + | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | + | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | + | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | + | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | + | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | + | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | + | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | + | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | + | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | + | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | + | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | + | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | + | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | + | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | + | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | + | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | + | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | + | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + + Additionally, after all the positional and velocity based values in the table, + the observation contains (in order): + - *cinert:* Mass and inertia of a single rigid body relative to the center of mass + (this is an intermediate result of transition). It has shape 13*10 (*nbody * 10*) + and hence adds to another 130 elements in the state space. + - *cvel:* Center of mass based velocity. It has shape 13 * 6 (*nbody * 6*) and hence + adds another 78 elements in the state space + - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape + `(17,)` *(nv * 1)* and hence adds another 17 elements to the state space. + - *cfrc_ext:* This is the center of mass based external force on the body. It has shape + 13 * 6 (*nbody * 6*) and hence adds to another 78 elements in the state space. + where *nbody* stands for the number of bodies in the robot and *nv* stands for the + number of degrees of freedom (*= dim(qvel)*) + + The body parts are: + + | id (for `v2`, `v3`, `v4)` | id (for `v5`) | body part | + | ---| --- | ------------ | + | 0 |excluded| worldbody (note: all values are constant 0) | + | 1 | 0 | torso | + | 2 | 1 | lwaist | + | 3 | 2 | pelvis | + | 4 | 3 | right_thigh | + | 5 | 4 | right_sin | + | 6 | 5 | right_foot | + | 7 | 6 | left_thigh | + | 8 | 7 | left_sin | + | 9 | 8 | left_foot | + | 10 | 9 | right_upper_arm | + | 11 | 10 | right_lower_arm | + | 12 | 11 | left_upper_arm | + | 13 | 12 | left_lower_arm | + + The joints are: + + | id (for `v2`, `v3`, `v4)` | id (for `v5`) | joint | + | ---| --- | ------------ | + | 0 |excluded| root (note: all values are constant 0) | + | 1 |excluded| root (note: all values are constant 0) | + | 2 |excluded| root (note: all values are constant 0) | + | 3 |excluded| root (note: all values are constant 0) | + | 4 |excluded| root (note: all values are constant 0) | + | 5 |excluded| root (note: all values are constant 0) | + | 6 | 0 | abdomen_z | + | 7 | 1 | abdomen_y | + | 8 | 2 | abdomen_x | + | 9 | 3 | right_hip_x | + | 10 | 4 | right_hip_z | + | 11 | 5 | right_hip_y | + | 12 | 6 | right_knee | + | 13 | 7 | left_hip_x | + | 14 | 8 | left_hiz_z | + | 15 | 9 | left_hip_y | + | 16 | 10 | left_knee | + | 17 | 11 | right_shoulder1 | + | 18 | 12 | right_shoulder2 | + | 19 | 13 | right_elbow| + | 20 | 14 | left_shoulder1 | + | 21 | 15 | left_shoulder2 | + | 22 | 16 | left_elfbow | + + The (x,y,z) coordinates are translational DOFs while the orientations are rotational + DOFs expressed as quaternions. One can read more about free joints on the + [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + + **Note:** Humanoid-v4 environment no longer has the following contact forces issue. + If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 + results in the contact forces always being 0. As such we recommend to use a Mujoco-Py + version < 2.0 when using the Humanoid environment if you would like to report results + with contact forces (if contact forces are not used in your experiments, you can use + version > 2.0). + + + ## Rewards + The reward consists of three parts: + - *uph_cost*: + A reward for moving upward (in an attempt to stand up). + This is not a relative reward which measures how much upward it has moved from the last timestep, + but it is an absolute reward which measures how much upward the Humanoid has moved overall. + It is measured as $weight_{uph} \times (z_{after action} - 0)/dt$, + where $z_{after action}$ is the z coordinate of the torso after taking an action, + and *dt* is the time between actions and is dependent on the `frame_skip` parameter + (default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*. + and $weight_{uph}$ is `uph_cost_weight`. + - *quad_ctrl_cost*: + A negative reward for penalizing the Humanoid if it takes actions that are too large. + $w_{quad_control} \times \\|action\\|_2^2$, + where $w_{quad_control}$ is `ctrl_cost_weight` (default is $0.1$). + If there are *nu* actuators/controls, then the control has shape `nu x 1`. + - *impact_cost*: + A negative reward for penalizing the Humanoid if the external contact forces are too large. + $w_{impact} \times clamp(impact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where + $w_{impact}$ is `impact_cost_weight` (default is $5\times10^{-7}$), + $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). + + The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*, + and `info` will also contain the individual reward terms. + + + ## Starting State + All observations start in state + (0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of + [-0.01, 0.01] added to the positional and velocity values (values in the table) + for stochasticity. Note that the initial z coordinate is intentionally selected + to be low, thereby indicating a laying down humanoid. The initial orientation is + designed to make it face forward as well. + + + ## Episode End + #### Termination + The Humanoid never terminates. + + #### Truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('HumanoidStandup-v5', impact_cost_weight=0.5e-6, ....) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** | `"humanoidstandup.xml"` | Path to a MuJoCo model | + | `uph_cost_weight` | **float** | `1` | Weight for _uph_cost_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `0.1` | Weight for _quad_ctrl_cost_ term (see section on reward) | + | `impact_cost_weight` | **float** | `0.5e-6` | Weight for _impact_cost_ term (see section on reward) | + | `impact_cost_range` | **float** | `(-np.inf, 10.0) | Clamps the _impact_cost_ | + | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + | `include_cinert_in_observation` | **bool** | `True` | Whether to include *cinert* elements in the observations. | + | `include_cvel_in_observation` | **bool** | `True` | Whether to include *cvel* elements in the observations. | + | `include_qfrc_actuator_in_observation` | **bool** | `True` | Whether to include *qfrc_actuator* elements in the observations. | + | `include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0). + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="humanoidstandup.xml", + frame_skip=5, + default_camera_config=DEFAULT_CAMERA_CONFIG, + uph_cost_weight=1, + ctrl_cost_weight=0.1, + impact_cost_weight=0.5e-6, + impact_cost_range=(-np.inf, 10.0), + reset_noise_scale=1e-2, + exclude_current_positions_from_observation=True, + include_cinert_in_observation=True, + include_cvel_in_observation=True, + include_qfrc_actuator_in_observation=True, + include_cfrc_ext_in_observation=True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + uph_cost_weight, + ctrl_cost_weight, + impact_cost_weight, + impact_cost_range, + reset_noise_scale, + exclude_current_positions_from_observation, + include_cinert_in_observation, + include_cvel_in_observation, + include_qfrc_actuator_in_observation, + include_cfrc_ext_in_observation, + **kwargs, + ) + + self._uph_cost_weight = uph_cost_weight + self._ctrl_cost_weight = ctrl_cost_weight + self._impact_cost_weight = impact_cost_weight + self._impact_cost_range = impact_cost_range + self._reset_noise_scale = reset_noise_scale + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + self._include_cinert_in_observation = include_cinert_in_observation + self._include_cvel_in_observation = include_cvel_in_observation + self._include_qfrc_actuator_in_observation = ( + include_qfrc_actuator_in_observation + ) + self._include_cfrc_ext_in_observation = include_cfrc_ext_in_observation + + obs_size = 47 + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += 130 * include_cinert_in_observation + obs_size += 78 * include_cvel_in_observation + obs_size += 17 * include_qfrc_actuator_in_observation + obs_size += 78 * include_cfrc_ext_in_observation + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = self.data.qpos.size + self.data.qvel.size + obs_size -= 2 * exclude_current_positions_from_observation + obs_size += self.data.cinert[1:].size * include_cinert_in_observation + obs_size += self.data.cvel[1:].size * include_cvel_in_observation + obs_size += (self.data.qvel.size - 6) * include_qfrc_actuator_in_observation + obs_size += self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation + + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + "cinert": self.data.cinert[1:].size * include_cinert_in_observation, + "cvel": self.data.cvel[1:].size * include_cvel_in_observation, + "qfrc_actuator": (self.data.qvel.size - 6) + * include_qfrc_actuator_in_observation, + "cfrc_ext": self.data.cfrc_ext[1:].size * include_cfrc_ext_in_observation, + "ten_lenght": 0, + "ten_velocity": 0, + } + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._include_cinert_in_observation is True: + com_inertia = self.data.cinert[1:].flat.copy() + else: + com_inertia = np.array([]) + if self._include_cvel_in_observation is True: + com_velocity = self.data.cvel[1:].flat.copy() + else: + com_velocity = np.array([]) + + if self._include_qfrc_actuator_in_observation is True: + actuator_forces = self.data.qfrc_actuator[6:].flat.copy() + else: + actuator_forces = np.array([]) + if self._include_cfrc_ext_in_observation is True: + external_contact_forces = self.data.cfrc_ext[1:].flat.copy() + else: + external_contact_forces = np.array([]) + + # TODO remove after validation + assert (self.data.cinert[0].flat.copy() == 0).all() + assert (self.data.cvel[0].flat.copy() == 0).all() + assert (self.data.qfrc_actuator[:6].flat.copy() == 0).all() + assert (self.data.cfrc_ext[0].flat.copy() == 0).all() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + return np.concatenate( + ( + position, + velocity, + com_inertia, + com_velocity, + actuator_forces, + external_contact_forces, + ) + ) + + def step(self, action): + self.do_simulation(action, self.frame_skip) + pos_after = self.data.qpos[2] + + uph_cost = (pos_after - 0) / self.model.opt.timestep + + quad_ctrl_cost = self._ctrl_cost_weight * np.square(self.data.ctrl).sum() + + quad_impact_cost = ( + self._impact_cost_weight * np.square(self.data.cfrc_ext).sum() + ) + min_impact_cost, max_impact_cost = self._impact_cost_range + quad_impact_cost = np.clip(quad_impact_cost, min_impact_cost, max_impact_cost) + + reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1 + + info = { + "reward_linup": uph_cost, + "reward_quadctrl": -quad_ctrl_cost, + "reward_impact": -quad_impact_cost, + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + } + + if self.render_mode == "human": + self.render() + return self._get_obs(), reward, False, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + } diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py new file mode 100644 index 000000000..028d856e5 --- /dev/null +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -0,0 +1,223 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 0, + "distance": 4.1225, + "lookat": np.array((0.0, 0.0, 0.12250000000000005)), +} + + +class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment originates from control theory and builds on the cartpole + environment based on the work done by Barto, Sutton, and Anderson in + ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), + powered by the Mujoco physics simulator - allowing for more complex experiments + (such as varying the effects of gravity or constraints). This environment involves a cart that can + moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one + (leaving the second pole as the only one with one free end). The cart can be pushed left or right, + and the goal is to balance the second pole on top of the first pole, which is in turn on top of the + cart, by applying continuous forces on the cart. + + + ## Action Space + The agent take a 1-element vector for actions. + The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the + numerical force applied to the cart (with magnitude representing the amount of force and + sign representing the direction) + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| + | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) | + + + ## Observation Space + The state space consists of positional values of different body parts of the pendulum system, + followed by the velocities of those individual parts (their derivatives) with all the + positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(9,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | ----------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | + | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless | + | 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless | + | 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless | + | 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless | + | 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | + | 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | + | 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) | + | 8 | constraint force - x | -Inf | Inf | slider | slide | Force (N) | + | excluded | constraint force - y | -Inf | Inf | hinge | slide | Force (N) | + | excluded | constraint force - z | -Inf | Inf | hinge2 | slide | Force (N) | + + + There is physical contact between the robots and their environment - and Mujoco + attempts at getting realistic physics simulations for the possible physical contact + dynamics by aiming for physical accuracy and computational efficiency. + + There is one constraint force for contacts for each degree of freedom (3). + The approach and handling of constraints by Mujoco is unique to the simulator and is based on their research. + Once can find more information in their [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html) + or in their paper ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf). + + + ## Rewards + The reward consists of two parts: + - *alive_bonus*: + The goal is to make the second inverted pendulum stand upright + (within a certain angle limit) as long as possible - as such a reward of `healthy_reward` is awarded + for each timestep that the second pole is upright. + - *distance_penalty*: + This reward is a measure of how far the *tip* of the second pendulum (the only free end) moves, + and it is calculated as $0.01 x_{pole2-tip}^2 + (y_{pole2-tip}-2)^2$, + where $x_{pole2-tip}, y_{pole2-tip}$ are the xy-coordinatesof the tip of the second pole. + - *velocity_penalty*: + A negative reward for penalising the agent if it moves too fast. + $10^{-3} \omega_1 + 5 10^{-3} \omega_2$, + where $\omega_1, \omega_2$ are the angular velocities of the hinges. + + The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty* + and `info` will also contain the individual reward terms. + + + ## Starting State + All observations start in state + (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range + of `[-reset_noise_scale, reset_noise_scale]` added to the positional values (cart position and pole angles) and standard + normal force with a standard deviation of `reset_noise_scale` added to the velocity values for stochasticity. + + + ## Episode End + #### Termination + The environment terminates when the Inverted Double Pendulum is unhealthy. + The Inverted Double Pendulum is unhealthy if any of the following happens: + + 1.Termination: The y_coordinate of the tip of the second pole $\leq 1$. + The maximum standing height of the system is 1.2 m when all the parts are perpendicularly vertical on top of each other). + + #### Truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`, `healthy_reward`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('InvertedDoublePendulum-v5', healthy_reward=10, ...) + ``` + + | Parameter | Type | Default | Description | + |-------------------------|------------|-------------- |-------------------------------| + | `xml_file` | **str** |`"inverted_double_pendulum.xml"`| Path to a MuJoCo model | + | `healthy_reward` | **float** | `10 | Constant reward given if the pendulum is "healthy" (upright) | + | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="inverted_double_pendulum.xml", + frame_skip=5, + healthy_reward=10.0, + reset_noise_scale=0.1, + **kwargs, + ): + utils.EzPickle.__init__(self, xml_file, frame_skip, reset_noise_scale, **kwargs) + + self._healthy_reward = healthy_reward + self._reset_noise_scale = reset_noise_scale + + observation_space = Box(low=-np.inf, high=np.inf, shape=(9,), dtype=np.float64) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + def step(self, action): + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + + x, _, y = self.data.site_xpos[0] + v1, v2 = self.data.qvel[1:3] + + terminated = bool(y <= 1) + + dist_penalty = 0.01 * x**2 + (y - 2) ** 2 + vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2 + alive_bonus = self._healthy_reward * int(not terminated) + reward = alive_bonus - dist_penalty - vel_penalty + + info = { + "reward_survive": alive_bonus, + "distance_penalty": -dist_penalty, + "velocity_penalty": -vel_penalty, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def _get_obs(self): + assert self.data.qfrc_constraint[2] == 0 # TODO remove after validation + assert self.data.qfrc_constraint[1] == 0 # TODO remove after validation + return np.concatenate( + [ + self.data.qpos[:1], # cart x pos + np.sin(self.data.qpos[1:]), # link angles + np.cos(self.data.qpos[1:]), + np.clip(self.data.qvel, -10, 10), + np.clip(self.data.qfrc_constraint, -10, 10)[:1], + ] + ).ravel() + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + self.set_state( + self.init_qpos + + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ), + self.init_qvel + + self.np_random.standard_normal(self.model.nv) * self._reset_noise_scale, + ) + return self._get_obs() diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_pendulum_v5.py new file mode 100644 index 000000000..ac3f940f0 --- /dev/null +++ b/gymnasium/envs/mujoco/inverted_pendulum_v5.py @@ -0,0 +1,178 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 0, + "distance": 2.04, +} + + +class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): + """ + ## Description + This environment is the cartpole environment based on the work done by + Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can + solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), + just like in the classic environments but now powered by the Mujoco physics simulator - + allowing for more complex experiments (such as varying the effects of gravity). + This environment involves a cart that can moved linearly, with a pole fixed on it + at one end and having another end free. The cart can be pushed left or right, and the + goal is to balance the pole on the top of the cart by applying forces on the cart. + + + ## Action Space + The agent take a 1-element vector for actions. + + The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents + the numerical force applied to the cart (with magnitude representing the amount of + force and sign representing the direction) + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| + | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) | + + + ## Observation Space + The state space consists of positional values of different body parts of + the pendulum system, followed by the velocities of those individual parts (their derivatives) + with all the positions ordered before all the velocities. + + The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | --------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------- | + | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | + | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) | + | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | + | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | anglular velocity (rad/s) | + + + ## Rewards + The goal is to make the inverted pendulum stand upright (within a certain angle limit) + as long as possible - as such a reward of +1 is awarded for each timestep that + the pole is upright. + + The pole is considered upright if: + $|angle| < 0.2$ + + and `info` will also contain the reward. + + + ## Starting State + All observations start in state + (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range + of `[-reset_noise_scale, reset_noise_scale]` added to the values for stochasticity. + + + ## Episode End + #### Termination + The environment terminates when the Inverted Pendulum is unhealthy. + The Inverted Pendulum is unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite. + 2. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian. + + #### Truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `reset_noise_scale`. + + ```python + import gymnasium as gym + env = gym.make('InvertedPendulum-v5', reset_noise_scale=0.1) + ``` + + | Parameter | Type | Default |Description | + |-------------------------|------------|--------------|-------------------------------| + | `xml_file` | **str** | `"inverted_double_pendulum.xml"` | Path to a MuJoCo model | + | `reset_noise_scale` | **float** | `0.01` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.. + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.5. + * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum). + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="inverted_pendulum.xml", + frame_skip=2, + reset_noise_scale=0.01, + **kwargs, + ): + utils.EzPickle.__init__(self, xml_file, frame_skip, reset_noise_scale, **kwargs) + observation_space = Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float64) + + self._reset_noise_scale = reset_noise_scale + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=DEFAULT_CAMERA_CONFIG, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + self.observation_structure = { + "qpos": self.data.qpos.size, + "qvel": self.data.qvel.size, + } + + def step(self, action): + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + + terminated = bool( + not np.isfinite(observation).all() or (np.abs(observation[1]) > 0.2) + ) + + reward = int(not terminated) + + info = {"reward_survive": reward} + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + size=self.model.nq, low=noise_low, high=noise_high + ) + qvel = self.init_qvel + self.np_random.uniform( + size=self.model.nv, low=noise_low, high=noise_high + ) + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate([self.data.qpos, self.data.qvel]).ravel() diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py new file mode 100644 index 000000000..58310f28c --- /dev/null +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -0,0 +1,245 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": -1, + "distance": 4.0, +} + + +class PusherEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + "Pusher" is a multi-jointed robot arm which is very similar to that of a human. + The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*). + The robot consists of shoulder, elbow, forearm, and wrist joints. + + + ## Action Space + The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | + | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | + | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | + | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | + | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | + | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | + | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | + + + ## Observation Space + Observations consist of + + - Angle of rotational joints on the pusher + - Angular velocities of rotational joints on the pusher + - The coordinates of the fingertip of the pusher + - The coordinates of the object to be moved + - The coordinates of the goal position + + The observation is a `Box(-Inf, Inf, (23,), float64)` where the elements correspond to the table below. + An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the + same as human joints. + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | -------------------------------------------------------- | ---- | --- | -------------------------------- | -------- | ------------------------ | + | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) | + | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) | + | 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) | + | 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) | + | 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) | + | 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) | + | 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) | + | 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) | + | 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) | + | 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) | + | 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) | + | 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) | + | 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) | + | 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) | + | 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | + | 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) | + | 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) | + | 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) | + | 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) | + | 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) | + | 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) | + + + ## Rewards + The reward consists of two parts: + - *reward_near*: + This reward is a measure of how far the *fingertip* of the pusher (the unattached end) is from the object, + with a more negative value assigned for when the pusher's *fingertip* is further away from the target. + It is $-w_{near} \|(P_{fingertip} - P_{target})\|_2$. + where $w_{near}$ is `reward_near_weight`. + - *reward_dist*: + This reward is a measure of how far the object is from the target goal position, + with a more negative value assigned for object that is further away from the target. + It is $-w_{dist} \|(P_{object} - P_{target})\|_2$. + where $w_{dist}$ is `reward_dist_weight`. + - *reward_control*: + A negative reward for penalising the pusher if it takes actions that are too large. + It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. + where $w_{control}$ is `reward_control_weight`. + + The total reward returned is ***reward*** *=* *reward_dist + reward_ctrl + reward_near*, + `info` will also contain the individual reward terms. + + + ## Starting State + All pusher (not including object and goal) states start in + (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range + [-0.005, 0.005] is added to the velocity attributes only. The velocities of + the object and goal are permanently set to 0. The object's x-position is selected uniformly + between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this + process is repeated until the vector norm between the object's (x,y) position and origin is not greater + than 0.17. The goal always have the same position of (0.45, -0.05, -0.323). + + The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05* + + + ## Episode End + #### Termination + The Pusher never terminates. + + #### Truncation + The maximum duration of an episode is 100 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`. + + ```python + import gymnasium as gym + env = gym.make('Pusher-v5', xml_file=...) + ``` + + | Parameter | Type | Default |Description | + |-------------------------|------------|--------------|----------------------------------------------------------| + | `xml_file` | **str** |`"pusher.xml"`| Path to a MuJoCo model | + | `reward_near_weight` | **float** | `0.5` | Weight for *reward_near* term (see section on reward) | + | `reward_dist_weight` | **float** | `1` | Weight for *reward_dist* term (see section on reward) | + | `reward_control_weight` | **float** | `0.1` | Weight for *reward_control* term (see section on reward) | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks (not including pusher, which has a max_time_steps of 100). Added reward_threshold to environments. + * v0: Initial versions release (1.0.0). + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="pusher.xml", + frame_skip=5, + default_camera_config=DEFAULT_CAMERA_CONFIG, + reward_near_weight=0.5, + reward_dist_weight=1, + reward_control_weight=0.1, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + reward_near_weight, + reward_dist_weight, + reward_control_weight, + **kwargs, + ) + self._reward_near_weight = reward_near_weight + self._reward_dist_weight = reward_dist_weight + self._reward_control_weight = reward_control_weight + + observation_space = Box(low=-np.inf, high=np.inf, shape=(23,), dtype=np.float64) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + def step(self, action): + vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm") + vec_2 = self.get_body_com("object") - self.get_body_com("goal") + + reward_near = -np.linalg.norm(vec_1) * self._reward_near_weight + reward_dist = -np.linalg.norm(vec_2) * self._reward_dist_weight + reward_ctrl = -np.square(action).sum() * self._reward_control_weight + reward = reward_dist + reward_ctrl + reward_near + + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + info = { + "reward_dist": reward_dist, + "reward_ctrl": reward_ctrl, + "reward_near": reward_near, + } + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def reset_model(self): + qpos = self.init_qpos + + self.goal_pos = np.asarray([0, 0]) + while True: + self.cylinder_pos = np.concatenate( + [ + self.np_random.uniform(low=-0.3, high=0, size=1), + self.np_random.uniform(low=-0.2, high=0.2, size=1), + ] + ) + if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17: + break + + qpos[-4:-2] = self.cylinder_pos + qpos[-2:] = self.goal_pos + qvel = self.init_qvel + self.np_random.uniform( + low=-0.005, high=0.005, size=self.model.nv + ) + qvel[-4:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + return np.concatenate( + [ + self.data.qpos.flat[:7], + self.data.qvel.flat[:7], + self.get_body_com("tips_arm"), + self.get_body_com("object"), + self.get_body_com("goal"), + ] + ) diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py new file mode 100644 index 000000000..2773bc034 --- /dev/null +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -0,0 +1,219 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0} + + +class ReacherEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a + target that is spawned at a random position. + + + ## Action Space + The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------| + | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | + | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | + + + ## Observation Space + Observations consist of + + - The cosine of the angles of the two arms + - The sine of the angles of the two arms + - The coordinates of the target + - The angular velocities of the arms + - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0) + + The observation is a `Box(-Inf, Inf, (10,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | ---------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | + | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | + | 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless | + | 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless | + | 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | + | 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | + | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) | + | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) | + | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | + | 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | + | excluded | z-value of position_fingertip - position_target (constantly 0 since reacher is 2d) | -Inf | Inf | NA | slide | position (m) | + + Most Gym environments just return the positions and velocity of the + joints in the `.xml` file as the state of the environment. However, in + reacher the state is created by combining only certain elements of the + position and velocity, and performing some function transformations on them. + If one is to read the `reacher.xml` then they will find 4 joints: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------| + | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) | + | 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) | + | 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | + | 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | + + + ## Rewards + The reward consists of two parts: + - *reward_distance*: + This reward is a measure of how far the *fingertip* of the reacher (the unattached end) is from the target, + with a more negative value assigned for when the reacher's *fingertip* is further away from the target. + It is $-w_{near} \|(P_{fingertip} - P_{target})\|_2$. + where $w_{near}$ is `reward_near_weight`. + - *reward_control*: + A negative reward for penalising the walker if it takes actions that are too large. + It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. + where $w_{control}$ is `reward_control_weight`. + + The total reward returned is ***reward*** *=* *reward_distance + reward_control*. + `info` will also contain the individual reward terms. + + ## Starting State + All observations start in state + (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + with a noise added for stochasticity. A uniform noise in the range + [-0.1, 0.1] is added to the positional attributes, while the target position + is selected uniformly at random in a disk of radius 0.2 around the origin. + Independent, uniform noise in the + range of [-0.005, 0.005] is added to the velocities, and the last + element ("fingertip" - "target") is calculated at the end once everything + is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02* + + ## Episode End + #### Termination + The Reacher never terminates. + + #### Truncation + The maximum duration of an episode is 50 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`. + + ```python + import gymnasium as gym + env = gym.make('Reacher-v5', xml_file=...) + ``` + + | Parameter | Type | Default |Description | + |-------------------------|------------|--------------|----------------------------------------------------------| + | `xml_file` | **str** |`"reacher.xml"`| Path to a MuJoCo model | + | `reward_dist_weight` | **float** | `1` | Weight for *reward_dist* term (see section on reward) | + | `reward_control_weight` | **float** | `0.1` | Weight for *reward_control* term (see section on reward) | + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: This environment does not have a v3 release. + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="reacher.xml", + frame_skip=2, + default_camera_config=DEFAULT_CAMERA_CONFIG, + reward_dist_weight=1, + reward_control_weight=1, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + reward_dist_weight, + reward_control_weight, + **kwargs, + ) + + self._reward_dist_weight = reward_dist_weight + self._reward_control_weight = reward_control_weight + + observation_space = Box(low=-np.inf, high=np.inf, shape=(10,), dtype=np.float64) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=observation_space, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + def step(self, action): + vec = self.get_body_com("fingertip") - self.get_body_com("target") + reward_dist = -np.linalg.norm(vec) * self._reward_dist_weight + reward_ctrl = -np.square(action).sum() * self._reward_control_weight + reward = reward_dist + reward_ctrl + + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + info = { + "reward_dist": reward_dist, + "reward_ctrl": reward_ctrl, + } + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def reset_model(self): + qpos = ( + self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + + self.init_qpos + ) + while True: + self.goal = self.np_random.uniform(low=-0.2, high=0.2, size=2) + if np.linalg.norm(self.goal) < 0.2: + break + qpos[-2:] = self.goal + qvel = self.init_qvel + self.np_random.uniform( + low=-0.005, high=0.005, size=self.model.nv + ) + qvel[-2:] = 0 + self.set_state(qpos, qvel) + return self._get_obs() + + def _get_obs(self): + theta = self.data.qpos.flat[:2] + assert (self.get_body_com("fingertip") - self.get_body_com("target"))[ + 2 + ] == 0 # TODO remove after validation + return np.concatenate( + [ + np.cos(theta), + np.sin(theta), + self.data.qpos.flat[2:], + self.data.qvel.flat[:2], + (self.get_body_com("fingertip") - self.get_body_com("target"))[:2], + ] + ) diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py new file mode 100644 index 000000000..2af6334b5 --- /dev/null +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -0,0 +1,259 @@ +__credits__ = ["Kallinteris-Andreas", "Rushiv Arora"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + + +class SwimmerEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis + ["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document). + The environment aims to increase the number of independent state and control + variables as compared to the classic control environments. The swimmers + consist of three or more segments ('***links***') and one less articulation + joints ('***rotors***') - one rotor joint connecting exactly two links to + form a linear chain. The swimmer is suspended in a two dimensional pool and + always starts in the same position (subject to some deviation drawn from an + uniform distribution), and the goal is to move as fast as possible towards + the right by applying torque on the rotors and using the fluids friction. + + ## Notes + + The problem parameters are: + Problem parameters: + * *n*: number of body parts + * *mi*: mass of part *i* (*i* ∈ {1...n}) + * *li*: length of part *i* (*i* ∈ {1...n}) + * *k*: viscous-friction coefficient + + While the default environment has *n* = 3, *li* = 0.1, + and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the + number of links, or to tweak any of the parameters. + + + ## Action Space + The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links* + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the first rotor | -1 | 1 | motor1_rot | hinge | torque (N m) | + | 1 | Torque applied on the second rotor | -1 | 1 | motor2_rot | hinge | torque (N m) | + + + ## Observation Space + By default, observations consists of: + * θi: angle of part *i* with respect to the *x* axis + * θi': its derivative with respect to time (angular velocity) + + In the default case, observations do not include the x- and y-coordinates of the front tip. These may + be included by passing `exclude_current_positions_from_observation=False` during construction. + Then, the observation space will be `Box(-Inf, Inf, (10,), float64)` where the first two observations + represent the x- and y-coordinates of the front tip. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates + will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, the observation is a `Box(-Inf, Inf, (8,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | + | 0 | angle of the front tip | -Inf | Inf | free_body_rot | hinge | angle (rad) | + | 1 | angle of the first rotor | -Inf | Inf | motor1_rot | hinge | angle (rad) | + | 2 | angle of the second rotor | -Inf | Inf | motor2_rot | hinge | angle (rad) | + | 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) | + | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | + | 5 | angular velocity of front tip | -Inf | Inf | free_body_rot | hinge | angular velocity (rad/s) | + | 6 | angular velocity of first rotor | -Inf | Inf | motor1_rot | hinge | angular velocity (rad/s) | + | 7 | angular velocity of second rotor | -Inf | Inf | motor2_rot | hinge | angular velocity (rad/s) | + | excluded | position of the tip along the x-axis | -Inf | Inf | slider1 | slide | position (m) | + | excluded | position of the tip along the y-axis | -Inf | Inf | slider2 | slide | position (m) | + + + ## Rewards + The reward consists of two parts: + - *forward_reward*: + A reward of moving forward, + this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 4), + and `frametime` which is 0.01 - making the default $dt = 4 \times 0.01 = 0.04$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward for penalizing the Swimmer if it takes actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-4}$). + + The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*, + and `info` will also contain the individual reward terms + + + ## Starting State + All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity. + + + ## Episode End + #### Termination + The Swimmer never terminates. + + #### truncation + The maximum duration of an episode is 1000 timesteps. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`. + + ```python + import gymnasium as gym + env = gym.make('Swimmer-v5', xml_file=...) + ``` + + | Parameter | Type | Default |Description | + |--------------------------------------------| --------- |-------------- |-------------------------------| + |`xml_file` | **str** |`"swimmer.xml"`| Path to a MuJoCo model | + |`forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward)| + |`ctrl_cost_weight` | **float** | `1e-4` | Weight for _ctrl_cost_ term (see section on reward) | + |`reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + |`exclude_current_positions_from_observation`| **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen). + * v2: All continuous control environments now use mujoco-py >= 1.50. + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="swimmer.xml", + frame_skip=4, + forward_reward_weight=1.0, + ctrl_cost_weight=1e-4, + reset_noise_scale=0.1, + exclude_current_positions_from_observation=True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + forward_reward_weight, + ctrl_cost_weight, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__(self, xml_file, frame_skip, observation_space=None, **kwargs) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - 2 * exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 2 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 2 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + def step(self, action): + xy_position_before = self.data.qpos[0:2].copy() + self.do_simulation(action, self.frame_skip) + xy_position_after = self.data.qpos[0:2].copy() + + xy_velocity = (xy_position_after - xy_position_before) / self.dt + x_velocity, y_velocity = xy_velocity + + forward_reward = self._forward_reward_weight * x_velocity + + ctrl_cost = self.control_cost(action) + + observation = self._get_obs() + reward = forward_reward - ctrl_cost + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "x_position": xy_position_after[0], + "y_position": xy_position_after[1], + "distance_from_origin": np.linalg.norm(xy_position_after, ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + } + + if self.render_mode == "human": + self.render() + + return observation, reward, False, False, info + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = self.data.qvel.flat.copy() + + if self._exclude_current_positions_from_observation: + position = position[2:] + + observation = np.concatenate([position, velocity]).ravel() + return observation + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + } diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py new file mode 100644 index 000000000..e1b790e99 --- /dev/null +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -0,0 +1,328 @@ +__credits__ = ["Kallinteris-Andreas"] + +import numpy as np +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box + +DEFAULT_CAMERA_CONFIG = { + "trackbodyid": 2, + "distance": 4.0, + "lookat": np.array((0.0, 0.0, 1.15)), + "elevation": -20.0, +} + + +class Walker2dEnv(MujocoEnv, utils.EzPickle): + r""" + ## Description + This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment + by adding another set of legs making it possible for the robot to walk forward instead of + hop. Like other Mujoco environments, this environment aims to increase the number of independent state + and control variables as compared to the classic control environments. The walker is a + two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top + (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs + in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. + The goal is to walk in the in the forward (right) + direction by applying torques on the six hinges connecting the seven body parts. + + + ## Action Space + The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| + | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | + | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | + | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | + | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | + | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | + | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | + + + ## Observation Space + Observations consist of positional values of different body parts of the walker, + followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + + By default, observations do not include the x-coordinate of the torso. It may + be included by passing `exclude_current_positions_from_observation=False` during construction. + In that case, the observation space will be `Box(-Inf, Inf, (18,), float64)` where the first observation + represent the x-coordinates of the torso of the walker. + Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate + of the torso will be returned in `info` with key `"x_position"`. + + By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | + | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | + | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | + | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | + | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | + | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | + | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | + | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | + | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | + | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | + | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | + | 9 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | + | 10 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | + | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | + | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | + | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | + | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | + | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | + | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | + + + ## Rewards + The reward consists of three parts: + - *healthy_reward*: + Every timestep that the Walker2d is alive, it receives a fixed reward of value `healthy_reward`, + - *forward_reward*: + A reward of moving forward, + this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). + $w_{forward} \times \frac{dx}{dt}$, where + $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), + $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 4), + and `frametime` which is 0.002 - making the default $dt = 4 \times 0.002 = 0.008$, + $w_{forward}$ is the `forward_reward_weight` (default is $1$). + - *ctrl_cost*: + A negative reward for penalizing the Walker2d if it takes actions that are too large. + $w_{control} \times \\|action\\|_2^2$, + where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). + + The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* + and `info` will also contain the individual reward terms. + + + ## Starting State + All observations start in state + (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. + + + ## Episode End + #### termination + If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Walker2d is unhealthy. + The Walker2d is unhealthy if any of the following happens: + + 1. Any of the state space values is no longer finite + 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` + 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` + + #### truncation + the maximum duration of an episode is 1000 timesteps. + + If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. + + + ## Arguments + `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + + ```python + import gymnasium as gym + env = gym.make('Walker2d-v5', ctrl_cost_weight=1e-3, ...) + ``` + + | Parameter | Type | Default | Description | + | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `xml_file` | **str** |`"walker2d_v5.xml"`| Path to a MuJoCo model | + | `forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward) | + | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | + | `healthy_reward` | **float** | `1` | Weight for _healthy_reward_ reward (see section on reward) | + | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | + | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the torso of the walker must be in this range to be considered healthy | + | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | + | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | + | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | + + + ## Version History + * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 + * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) + * v2: All continuous control environments now use mujoco-py >= 1.50 + * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. + * v0: Initial versions release (1.0.0) + """ + + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + } + + def __init__( + self, + xml_file="walker2d_v5.xml", + frame_skip=4, + default_camera_config=DEFAULT_CAMERA_CONFIG, + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_z_range=(0.8, 2.0), + healthy_angle_range=(-1.0, 1.0), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=True, + **kwargs, + ): + utils.EzPickle.__init__( + self, + xml_file, + frame_skip, + default_camera_config, + forward_reward_weight, + ctrl_cost_weight, + healthy_reward, + terminate_when_unhealthy, + healthy_z_range, + healthy_angle_range, + reset_noise_scale, + exclude_current_positions_from_observation, + **kwargs, + ) + + self._forward_reward_weight = forward_reward_weight + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + + self._healthy_z_range = healthy_z_range + self._healthy_angle_range = healthy_angle_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation + ) + + MujocoEnv.__init__( + self, + xml_file, + frame_skip, + observation_space=None, + default_camera_config=default_camera_config, + **kwargs, + ) + + self.metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": int(np.round(1.0 / self.dt)), + } + + obs_size = ( + self.data.qpos.size + + self.data.qvel.size + - exclude_current_positions_from_observation + ) + self.observation_space = Box( + low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64 + ) + + self.observation_structure = { + "skipped_qpos": 1 * exclude_current_positions_from_observation, + "qpos": self.data.qpos.size + - 1 * exclude_current_positions_from_observation, + "qvel": self.data.qvel.size, + } + + @property + def healthy_reward(self): + return self.is_healthy * self._healthy_reward + + def control_cost(self, action): + control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) + return control_cost + + @property + def is_healthy(self): + z, angle = self.data.qpos[1:3] + + min_z, max_z = self._healthy_z_range + min_angle, max_angle = self._healthy_angle_range + + healthy_z = min_z < z < max_z + healthy_angle = min_angle < angle < max_angle + is_healthy = healthy_z and healthy_angle + + return is_healthy + + @property + def terminated(self): + terminated = (not self.is_healthy) and self._terminate_when_unhealthy + # TODO remove after validation + assert terminated == ( + not self.is_healthy if self._terminate_when_unhealthy else False + ) + return terminated + + def _get_obs(self): + position = self.data.qpos.flat.copy() + velocity = np.clip(self.data.qvel.flat.copy(), -10, 10) + + if self._exclude_current_positions_from_observation: + position = position[1:] + + observation = np.concatenate((position, velocity)).ravel() + return observation + + def step(self, action): + x_position_before = self.data.qpos[0] + self.do_simulation(action, self.frame_skip) + x_position_after = self.data.qpos[0] + x_velocity = (x_position_after - x_position_before) / self.dt + + ctrl_cost = self.control_cost(action) + + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + + rewards = forward_reward + healthy_reward + costs = ctrl_cost + + observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated + info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_survive": healthy_reward, + "x_position": x_position_after, + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + "x_velocity": x_velocity, + } + + if self.render_mode == "human": + self.render() + + return observation, reward, terminated, False, info + + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq + ) + qvel = self.init_qvel + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nv + ) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + def _get_reset_info(self): + return { + "x_position": self.data.qpos[0], + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + } diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py new file mode 100644 index 000000000..ab53a6474 --- /dev/null +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -0,0 +1,471 @@ +import gymnasium as gym +import pytest +import numpy as np +from gymnasium.utils.env_checker import check_env +import mujoco +from gymnasium.error import Error +import warnings + +ALL_MUJOCO_ENVS = ['Ant', 'HalfCheetah', 'Hopper', 'Humanoid', 'HumanoidStandup', 'InvertedDoublePendulum', 'InvertedPendulum', "Pusher", "Reacher", "Swimmer", 'Walker2d'] + + +# Note: "HumnanoidStandup-v4" does not have `info` +# Note: "Humnanoid-v4/3" & "Ant-v4/3" fail this test +@pytest.mark.parametrize("env_id", ['Ant-v5', 'HalfCheetah-v5', 'HalfCheetah-v4', 'HalfCheetah-v3', 'Hopper-v5', 'Hopper-v4', 'Hopper-v3', 'Humanoid-v5', 'HumanoidStandup-v5', 'Swimmer-v5', 'Swimmer-v4', 'Swimmer-v3', 'Walker2d-v5', 'Walker2d-v4', 'Walker2d-v3']) +def test_verify_info_x_position(env_id): + """Asserts that the environment has position[0] == info['x_position']""" + env = gym.make(env_id, exclude_current_positions_from_observation=False) + + _, _ = env.reset() + obs, _, _, _, info = env.step(env.action_space.sample()) + + assert obs[0] == info['x_position'] + + +# Note: "HumnanoidStandup-v4" does not have `info` +# Note: "Humnanoid-v4/3" & "Ant-v4/3" fail this test +@pytest.mark.parametrize("env_id", ['Ant-v5', 'Humanoid-v5', 'HumanoidStandup-v5', 'Swimmer-v5']) +def test_verify_info_y_position(env_id): + """Asserts that the environment has position[1] == info['y_position']""" + env = gym.make(env_id, exclude_current_positions_from_observation=False) + + _, _ = env.reset() + obs, _, _, _, info = env.step(env.action_space.sample()) + + assert obs[1] == info['y_position'] + + +# Note: "HumnanoidStandup-v4" does not have `info` +@pytest.mark.parametrize("env", ['HalfCheetah', 'Hopper', 'Swimmer', 'Walker2d']) +@pytest.mark.parametrize("version", ['v5', 'v4', 'v3']) +def test_verify_info_x_velocity(env, version): + """Asserts that the environment `info['x_velocity']` is properly assigned""" + env = gym.make(f"{env}-{version}") + env.reset() + + old_x = env.unwrapped.data.qpos[0] + _, _, _, _, info = env.step(env.action_space.sample()) + new_x = env.unwrapped.data.qpos[0] + + dx = new_x - old_x + vel_x = dx / env.dt + assert vel_x == info['x_velocity'] + + +# Note: "HumnanoidStandup-v4" does not have `info` +@pytest.mark.parametrize("env_id", ['Swimmer-v5', 'Swimmer-v4', 'Swimmer-v3']) +def test_verify_info_y_velocity(env_id): + """Asserts that the environment `info['y_velocity']` is properly assigned""" + env = gym.make(env_id) + env.reset() + + old_y = env.unwrapped.data.qpos[1] + _, _, _, _, info = env.step(env.action_space.sample()) + new_y = env.unwrapped.data.qpos[1] + + dy = new_y - old_y + vel_y = dy / env.dt + assert vel_y == info['y_velocity'] + + +@pytest.mark.parametrize("env_id", ['Ant-v5', 'Ant-v4', 'Ant-v3']) +def test_verify_info_xy_velocity_xpos(env_id): + """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the ant environment which uses kinmatics for the velocity""" + env = gym.make(env_id) + env.reset() + + old_xy = env.get_body_com("torso")[:2].copy() + _, _, _, _, info = env.step(env.action_space.sample()) + new_xy = env.get_body_com("torso")[:2].copy() + + dxy = new_xy - old_xy + vel_x, vel_y = dxy / env.dt + assert vel_x == info['x_velocity'] + assert vel_y == info['y_velocity'] + + +@pytest.mark.parametrize("env_id", ['Humanoid-v5', 'Humanoid-v4', 'Humanoid-v3']) +def test_verify_info_xy_velocity_com(env_id): + """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the humanoid environment which uses kinmatics of Center Of Mass for the velocity""" + + def mass_center(model, data): + mass = np.expand_dims(model.body_mass, axis=1) + xpos = data.xipos + return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() + + env = gym.make(env_id) + env.reset() + + old_xy = mass_center(env.unwrapped.model, env.unwrapped.data) + _, _, _, _, info = env.step(env.action_space.sample()) + new_xy = mass_center(env.unwrapped.model, env.unwrapped.data) + + dxy = new_xy - old_xy + vel_x, vel_y = dxy / env.dt + assert vel_x == info['x_velocity'] + assert vel_y == info['y_velocity'] + + +# Note: Hopper-v4/3/2 does not have `info['reward_survive']`, but it is still affected +# Note: Walker2d-v4/3/2 does not have `info['reward_survive']`, but it is still affected +# Note: Inverted(Double)Pendulum-v4/2 does not have `info['reward_survive']`, but it is still affected +# Note: all `v4/v3/v2` environments with a heathly reward are fail this test +@pytest.mark.parametrize("env_id", ['Ant-v5', 'Hopper-v5', 'Humanoid-v5', 'InvertedDoublePendulum-v5', 'InvertedPendulum-v5', 'Walker2d-v5']) +def test_verify_reward_survive(env_id): + """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states""" + env = gym.make(env_id, reset_noise_scale=0) + env.reset(seed=0) + env.action_space.seed(0) + + for step in range(175): + obs, rew, terminal, truncated, info = env.step(env.action_space.sample()) + + if terminal: + assert info['reward_survive'] == 0 + break + + assert info['reward_survive'] != 0 + + assert terminal, "The environment, should have terminated, if not the test is not valid." + + +CHECK_ENV_IGNORE_WARNINGS = [ + f"\x1b[33mWARN: {message}\x1b[0m" + for message in [ + "A Box observation space minimum value is -infinity. This is probably too low.", + "A Box observation space maximum value is -infinity. This is probably too high.", + "For Box action spaces, we recommend using a symmetric and normalized space (range=[-1, 1] or [0, 1]). See https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html for more information.", + ] +] + + +@pytest.mark.parametrize("env", ALL_MUJOCO_ENVS) +@pytest.mark.parametrize("version", ['v5']) +@pytest.mark.parametrize("frame_skip", [1, 2, 3, 4, 5]) +def test_frame_skip(env, version, frame_skip): + """Verify that all `mujoco` envs work with different `frame_skip` values""" + env_id = f"{env}-{version}" + env = gym.make(env_id, frame_skip=frame_skip) + + # Test if env adheres to Gym API + with warnings.catch_warnings(record=True) as w: + gym.utils.env_checker.check_env(env.unwrapped, skip_render_check=True) + env.close() + for warning in w: + if warning.message.args[0] not in CHECK_ENV_IGNORE_WARNINGS: + raise Error(f"Unexpected warning: {warning.message}") + + +# Dev Note: This can be version env parametrized because each env has it's own reward function +@pytest.mark.parametrize("version", ['v5']) +def test_reward_sum(version): + """Assert that the total reward equals the sum of the individual reward terms.""" + env = gym.make(f'Ant-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_contact"] - info["reward_survive"] < 1e-14 + + env = gym.make(f'HalfCheetah-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_forward"] - info["reward_ctrl"] < 1e-14 + + env = gym.make(f'Hopper-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] < 1e-14 + + env = gym.make(f'Humanoid-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_contact"] - info["reward_survive"] < 1e-14 + + env = gym.make(f'HumanoidStandup-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_linup"] - info["reward_quadctrl"] - info["reward_impact"] - 1 < 1e-14 + + env = gym.make(f'InvertedDoublePendulum-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_survive"] - info["distance_penalty"] - info["velocity_penalty"] < 1e-14 + + env = gym.make(f'InvertedPendulum-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_survive"] + + env = gym.make(f'Pusher-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_dist"] - info["reward_ctrl"] - info["reward_near"] < 1e-14 + + env = gym.make(f'Reacher-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_dist"] - info["reward_ctrl"] < 1e-14 + + env = gym.make(f'Swimmer-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_forward"] - info["reward_ctrl"] < 1e-14 + + env = gym.make(f'Walker2d-{version}') + env.reset() + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] < 1e-14 + + +# Note: the environtments that are not present, is because they do not have identical behaviour +@pytest.mark.parametrize("env", ['HalfCheetah', 'HumanoidStandup', 'Pusher', 'Reacher', 'Swimmer']) +def test_identical_behaviour_v45(env): + """Verify that v4 -> v5 transition. does not change the behaviour of the environments in way way""" + env_v4 = gym.make(env + '-v4') + env_v5 = gym.make(env + '-v5') + env_v4.reset(seed=1234) + env_v5.reset(seed=1234) + action = env_v4.action_space.sample() + obs_v4, rew_v4, terminal_v4, truncated_v4, info_v4 = env_v4.step(action) + obs_v5, rew_v5, terminal_v5, truncated_v5, info_v5 = env_v5.step(action) + assert obs_v4.shape[0] != 1 + assert obs_v5.shape[0] != 1 + assert (env_v4.unwrapped.data.qpos == env_v5.unwrapped.data.qpos).all() + assert (env_v4.unwrapped.data.qvel == env_v5.unwrapped.data.qvel).all() + if env not in ['HumanoidStandup', 'Reacher']: # they have different obs + assert (obs_v4 == obs_v5).all() + assert rew_v4 == rew_v5 + assert terminal_v4 == terminal_v5 and truncated_v4 == truncated_v5 + + +@pytest.mark.parametrize("version", ['v5', 'v4']) +def test_ant_com(version): + """Verify the kinmatic behaviour of the ant""" + env = gym.make(f'Ant-{version}') # `env` contains `data : MjData` and `model : MjModel` + env.reset() # randomly initlizies the `data.qpos` and `data.qvel`, calls mujoco.mj_forward(env.model, env.data) + + x_position_before = env.unwrapped.data.qpos[0] + x_position_before_com = env.unwrapped.data.body("torso").xpos[0] + assert x_position_before == x_position_before_com, "before failed" # This succeeds + + random_control = env.action_space.sample() + _, _, _, _, info = env.step(random_control) # This calls mujoco.mj_step(env.model, env.data, nstep=env.frame_skip) + mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) + + x_position_after = env.unwrapped.data.qpos[0] + x_position_after_com = env.unwrapped.data.body("torso").xpos[0] + assert x_position_after == x_position_after_com, "after failed" # This succeeds + + +@pytest.mark.parametrize("version", ['v5', 'v4', 'v3', 'v2']) +def test_set_state(version): + """Simple Test to verify that `mujocoEnv.set_state()` works correctly""" + env = gym.make(f'Hopper-{version}') + env.reset() + new_qpos = np.array([0.00136962, 1.24769787, -0.00459026, -0.00483472, 0.0031327 , 0.00412756]) + new_qvel = np.array([0.00106636, 0.00229497, 0.00043625, 0.00435072, 0.00315854, -0.00497261]) + env.set_state(new_qpos, new_qvel) + assert (env.unwrapped.data.qpos == new_qpos).all() + assert (env.unwrapped.data.qvel == new_qvel).all() + + +# Note: HumanoidStandup-v4/v3 does not have `info` +# Note: Ant-v4/v3 fails this test +# Note: Humanoid-v4/v3 fails this test +# Note: v2 does not have `info` +@pytest.mark.parametrize("env_id", ['Ant-v5', 'Humanoid-v5', 'Swimmer-v5', 'Swimmer-v4', 'Swimmer-v3']) +def test_distance_from_origin_info(env_id): + """Verify that `info"distance_from_origin"` is correct""" + env = gym.make(env_id) + env.reset() + _, _, _, _, info = env.step(env.action_space.sample()) + assert info["distance_from_origin"] == np.linalg.norm(env.unwrapped.data.qpos[0:2] - env.init_qpos[0:2]) + + +@pytest.mark.parametrize("env_id", ['Hopper-v5', 'HumanoidStandup-v5', 'Walker2d-v5']) +def test_z_distance_from_origin_info(env_id): + """Verify that `info"z_distance_from_origin"` is correct""" + env = gym.make(env_id) + env.reset() + _, _, _, _, info = env.step(env.action_space.sample()) + mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) + z_index = env.observation_structure["skipped_qpos"] + assert info["z_distance_from_origin"] == env.unwrapped.data.qpos[z_index] - env.init_qpos[z_index] + + +@pytest.mark.parametrize("env_id", ['Ant-v5', 'HalfCheetah-v5', 'Hopper-v5', 'Humanoid-v5', 'HumanoidStandup-v5', 'InvertedPendulum-v5', 'Swimmer-v5', 'Walker2d-v5']) +def test_observation_structure(env_id): + """Verify that the `env.observation_structure` is properly defined.""" + env = gym.make(env_id) + if hasattr(env, "observation_structure"): + return + + obs_struct = env.observation_structure + + assert env.unwrapped.model.nq == obs_struct.get("skipped_qpos", 0) + obs_struct["qpos"] + assert env.unwrapped.model.nv == obs_struct["qvel"] + if obs_struct.get("cinert", 0): + assert (env.unwrapped.model.nbody - 1) * 10 == obs_struct["cinert"] + if obs_struct.get("cvel", 0): + assert (env.unwrapped.model.nbody - 1) * 6 == obs_struct["cvel"] + if obs_struct.get("qfrc_actuator", 0): + assert env.unwrapped.model.nv - 6 == obs_struct["qfrc_actuator"] + if obs_struct.get("cfrc_ext", 0): + assert (env.unwrapped.model.nbody - 1) * 6 == obs_struct["cfrc_ext"] + if obs_struct.get("ten_lenght", 0): + assert env.unwrapped.model.ntendon == obs_struct["ten_lenght"] + if obs_struct.get("ten_velocity", 0): + assert env.unwrapped.model.ntendon == obs_struct["ten_velocity"] + + +@pytest.mark.parametrize("env_id", ['Ant-v5', 'HalfCheetah-v5', 'Hopper-v5', 'Humanoid-v5', 'HumanoidStandup-v5', 'Swimmer-v5', 'Walker2d-v5']) +def test_reset_info(env_id): + """Verify that the environmet returns info at `reset()`""" + env = gym.make(env_id) + _, reset_info = env.reset() + assert reset_info.get('x_position') + + +""" +[Bug Report] [Documentation] Inverted Double Pendulum max Height is wrong + +The Documentation States: +```md +The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other) +``` +but the height of each pole is 0.6 (0.6+0.6==1.2) +https://github.com/Farama-Foundation/Gymnasium/blob/deb50802facfd827abd4d1f0cf1069afb12a726b/gymnasium/envs/mujoco/assets/inverted_double_pendulum.xml#L33-L39 +""" + + +# Note: the max height used to be wrong in the documention. +@pytest.mark.parametrize("version", ['v5']) +def test_inverted_double_pendulum_max_height(version): + """Verify the max height of Inverted Double Pendulum""" + env = gym.make(f"InvertedDoublePendulum-{version}", reset_noise_scale=0) + env.reset() + y = env.unwrapped.data.site_xpos[0][2] + assert y == 1.2 + + +@pytest.mark.parametrize("version", ['v4']) +def test_inverted_double_pendulum_max_height_old(version): + """Verify the max height of Inverted Double Pendulum (v4 does not have `reset_noise_scale` argument)""" + env = gym.make(f"InvertedDoublePendulum-{version}") + env.set_state(env.init_qpos, env.init_qvel) + y = env.unwrapped.data.site_xpos[0][2] + assert y == 1.2 + + +# note: fails with `brax==0.9.0` +@pytest.mark.parametrize("version", ['v5', 'v4']) +def test_model_object_count(version): + """Verify that all the objects of the model are loaded, mostly usefull for using non-mujoco simulator.""" + env = gym.make(f'Ant-{version}') + assert env.unwrapped.model.nq == 15 + assert env.unwrapped.model.nv == 14 + assert env.unwrapped.model.nu == 8 + assert env.unwrapped.model.nbody == 14 + assert env.unwrapped.model.nbvh == 14 + assert env.unwrapped.model.njnt == 9 + assert env.unwrapped.model.ngeom == 14 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'HalfCheetah-{version}') + assert env.unwrapped.model.nq == 9 + assert env.unwrapped.model.nv == 9 + assert env.unwrapped.model.nu == 6 + assert env.unwrapped.model.nbody == 8 + assert env.unwrapped.model.nbvh == 10 + assert env.unwrapped.model.njnt == 9 + assert env.unwrapped.model.ngeom == 9 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'Hopper-{version}') + assert env.unwrapped.model.nq == 6 + assert env.unwrapped.model.nv == 6 + assert env.unwrapped.model.nu == 3 + assert env.unwrapped.model.nbody == 5 + assert env.unwrapped.model.nbvh == 5 + assert env.unwrapped.model.njnt == 6 + assert env.unwrapped.model.ngeom == 5 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'Humanoid-{version}') + assert env.unwrapped.model.nq == 24 + assert env.unwrapped.model.nv == 23 + assert env.unwrapped.model.nu == 17 + assert env.unwrapped.model.nbody == 14 + assert env.unwrapped.model.nbvh == 22 + assert env.unwrapped.model.njnt == 18 + assert env.unwrapped.model.ngeom == 18 + assert env.unwrapped.model.ntendon == 2 + + env = gym.make(f'HumanoidStandup-{version}') + assert env.unwrapped.model.nq == 24 + assert env.unwrapped.model.nv == 23 + assert env.unwrapped.model.nu == 17 + assert env.unwrapped.model.nbody == 14 + assert env.unwrapped.model.nbvh == 22 + assert env.unwrapped.model.njnt == 18 + assert env.unwrapped.model.ngeom == 18 + assert env.unwrapped.model.ntendon == 2 + + env = gym.make(f'InvertedDoublePendulum-{version}') + assert env.unwrapped.model.nq == 3 + assert env.unwrapped.model.nv == 3 + assert env.unwrapped.model.nu == 1 + assert env.unwrapped.model.nbody == 4 + assert env.unwrapped.model.nbvh == 6 + assert env.unwrapped.model.njnt == 3 + assert env.unwrapped.model.ngeom == 5 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'InvertedPendulum-{version}') + assert env.unwrapped.model.nq == 2 + assert env.unwrapped.model.nv == 2 + assert env.unwrapped.model.nu == 1 + assert env.unwrapped.model.nbody == 3 + assert env.unwrapped.model.nbvh == 3 + assert env.unwrapped.model.njnt == 2 + assert env.unwrapped.model.ngeom == 3 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'Pusher-{version}') + assert env.unwrapped.model.nq == 11 + assert env.unwrapped.model.nv == 11 + assert env.unwrapped.model.nu == 7 + assert env.unwrapped.model.nbody == 13 + assert env.unwrapped.model.nbvh == 18 + assert env.unwrapped.model.njnt == 11 + assert env.unwrapped.model.ngeom == 21 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'Reacher-{version}') + assert env.unwrapped.model.nq == 4 + assert env.unwrapped.model.nv == 4 + assert env.unwrapped.model.nu == 2 + assert env.unwrapped.model.nbody == 5 + assert env.unwrapped.model.nbvh == 5 + assert env.unwrapped.model.njnt == 4 + assert env.unwrapped.model.ngeom == 10 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'Swimmer-{version}') + assert env.unwrapped.model.nq == 5 + assert env.unwrapped.model.nv == 5 + assert env.unwrapped.model.nu == 2 + assert env.unwrapped.model.nbody == 4 + assert env.unwrapped.model.nbvh == 4 + assert env.unwrapped.model.njnt == 5 + assert env.unwrapped.model.ngeom == 4 + assert env.unwrapped.model.ntendon == 0 + + env = gym.make(f'Walker2d-{version}') + assert env.unwrapped.model.nq == 9 + assert env.unwrapped.model.nv == 9 + assert env.unwrapped.model.nu == 6 + assert env.unwrapped.model.nbody == 8 + assert env.unwrapped.model.nbvh == 8 + assert env.unwrapped.model.njnt == 9 + assert env.unwrapped.model.ngeom == 8 + assert env.unwrapped.model.ntendon == 0 From c3077ef4aa3c631fb375e5658a825879f305e1f1 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 27 Jun 2023 22:50:26 +0300 Subject: [PATCH 10/53] `pre-commit` --- gymnasium/envs/mujoco/ant_v5.py | 10 +- gymnasium/envs/mujoco/half_cheetah_v5.py | 2 + gymnasium/envs/mujoco/hopper_v5.py | 2 + gymnasium/envs/mujoco/humanoid_v5.py | 2 + gymnasium/envs/mujoco/humanoidstandup_v5.py | 2 + .../mujoco/inverted_double_pendulum_v5.py | 2 + gymnasium/envs/mujoco/inverted_pendulum_v5.py | 2 + gymnasium/envs/mujoco/pusher_v5.py | 2 + gymnasium/envs/mujoco/reacher_v5.py | 2 + gymnasium/envs/mujoco/swimmer_v5.py | 1 + gymnasium/envs/mujoco/walker2d_v5.py | 2 + tests/envs/mujoco/test_mujoco_v5.py | 296 +++++++++++++----- 12 files changed, 235 insertions(+), 90 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index dcf2d3887..38b00d503 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -3,10 +3,12 @@ from typing import Dict, Tuple, Union import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "distance": 4.0, } @@ -191,7 +193,7 @@ class AntEnv(MujocoEnv, utils.EzPickle): |`ctrl_cost_weight` | **float** | `0.5` | Weight for _ctrl_cost_ term (see section on reward) | |`contact_cost_weight` | **float** | `5e-4` | Weight for _contact_cost_ term (see section on reward) | |`healthy_reward` | **float** | `1` | Weight for _healthy_reward_ term (see section on reward) | - |`main_body` |**str|int** | `1`("torso") | Name or ID of the body, whose diplacement is used to calculate the *dx*/_forward_reward_ (useful for custom MuJoCo models)| + |`main_body` |**str|int** | `1`("torso") | Name or ID of the body, whose displacement is used to calculate the *dx*/_forward_reward_ (useful for custom MuJoCo models)| |`terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | |`healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range | |`contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* | @@ -232,7 +234,7 @@ def __init__( reset_noise_scale: float = 0.1, exclude_current_positions_from_observation: bool = True, include_cfrc_ext_in_observation: bool = True, - **kwargs + **kwargs, ): utils.EzPickle.__init__( self, @@ -250,7 +252,7 @@ def __init__( reset_noise_scale, exclude_current_positions_from_observation, include_cfrc_ext_in_observation, - **kwargs + **kwargs, ) self._forward_reward_weight = forward_reward_weight @@ -278,7 +280,7 @@ def __init__( frame_skip, observation_space=None, # needs to be defined after default_camera_config=default_camera_config, - **kwargs + **kwargs, ) self.metadata = { diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py index d0ead9924..b02082bfe 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v5.py +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas", "Rushiv Arora"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "distance": 4.0, } diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 1d808a324..082a77c64 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "trackbodyid": 2, "distance": 3.0, diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index b45afc80b..6c4245d99 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "trackbodyid": 1, "distance": 4.0, diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index ba13b9f13..72a419046 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "trackbodyid": 1, "distance": 4.0, diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py index 028d856e5..8e0d4eaaf 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "trackbodyid": 0, "distance": 4.1225, diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_pendulum_v5.py index ac3f940f0..4be166b0c 100644 --- a/gymnasium/envs/mujoco/inverted_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_pendulum_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "trackbodyid": 0, "distance": 2.04, diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index 58310f28c..f0fc6b059 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "trackbodyid": -1, "distance": 4.0, diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index 2773bc034..2227e848d 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0} diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 2af6334b5..7f64fa113 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -1,6 +1,7 @@ __credits__ = ["Kallinteris-Andreas", "Rushiv Arora"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index e1b790e99..8c286841c 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -1,10 +1,12 @@ __credits__ = ["Kallinteris-Andreas"] import numpy as np + from gymnasium import utils from gymnasium.envs.mujoco import MujocoEnv from gymnasium.spaces import Box + DEFAULT_CAMERA_CONFIG = { "trackbodyid": 2, "distance": 4.0, diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index ab53a6474..10a1df169 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -1,17 +1,50 @@ -import gymnasium as gym -import pytest -import numpy as np -from gymnasium.utils.env_checker import check_env +import warnings + import mujoco +import numpy as np +import pytest + +import gymnasium as gym from gymnasium.error import Error -import warnings -ALL_MUJOCO_ENVS = ['Ant', 'HalfCheetah', 'Hopper', 'Humanoid', 'HumanoidStandup', 'InvertedDoublePendulum', 'InvertedPendulum', "Pusher", "Reacher", "Swimmer", 'Walker2d'] + +ALL_MUJOCO_ENVS = [ + "Ant", + "HalfCheetah", + "Hopper", + "Humanoid", + "HumanoidStandup", + "InvertedDoublePendulum", + "InvertedPendulum", + "Pusher", + "Reacher", + "Swimmer", + "Walker2d", +] # Note: "HumnanoidStandup-v4" does not have `info` # Note: "Humnanoid-v4/3" & "Ant-v4/3" fail this test -@pytest.mark.parametrize("env_id", ['Ant-v5', 'HalfCheetah-v5', 'HalfCheetah-v4', 'HalfCheetah-v3', 'Hopper-v5', 'Hopper-v4', 'Hopper-v3', 'Humanoid-v5', 'HumanoidStandup-v5', 'Swimmer-v5', 'Swimmer-v4', 'Swimmer-v3', 'Walker2d-v5', 'Walker2d-v4', 'Walker2d-v3']) +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "HalfCheetah-v5", + "HalfCheetah-v4", + "HalfCheetah-v3", + "Hopper-v5", + "Hopper-v4", + "Hopper-v3", + "Humanoid-v5", + "HumanoidStandup-v5", + "Swimmer-v5", + "Swimmer-v4", + "Swimmer-v3", + "Walker2d-v5", + "Walker2d-v4", + "Walker2d-v3", + ], +) def test_verify_info_x_position(env_id): """Asserts that the environment has position[0] == info['x_position']""" env = gym.make(env_id, exclude_current_positions_from_observation=False) @@ -19,12 +52,14 @@ def test_verify_info_x_position(env_id): _, _ = env.reset() obs, _, _, _, info = env.step(env.action_space.sample()) - assert obs[0] == info['x_position'] + assert obs[0] == info["x_position"] # Note: "HumnanoidStandup-v4" does not have `info` # Note: "Humnanoid-v4/3" & "Ant-v4/3" fail this test -@pytest.mark.parametrize("env_id", ['Ant-v5', 'Humanoid-v5', 'HumanoidStandup-v5', 'Swimmer-v5']) +@pytest.mark.parametrize( + "env_id", ["Ant-v5", "Humanoid-v5", "HumanoidStandup-v5", "Swimmer-v5"] +) def test_verify_info_y_position(env_id): """Asserts that the environment has position[1] == info['y_position']""" env = gym.make(env_id, exclude_current_positions_from_observation=False) @@ -32,12 +67,12 @@ def test_verify_info_y_position(env_id): _, _ = env.reset() obs, _, _, _, info = env.step(env.action_space.sample()) - assert obs[1] == info['y_position'] + assert obs[1] == info["y_position"] # Note: "HumnanoidStandup-v4" does not have `info` -@pytest.mark.parametrize("env", ['HalfCheetah', 'Hopper', 'Swimmer', 'Walker2d']) -@pytest.mark.parametrize("version", ['v5', 'v4', 'v3']) +@pytest.mark.parametrize("env", ["HalfCheetah", "Hopper", "Swimmer", "Walker2d"]) +@pytest.mark.parametrize("version", ["v5", "v4", "v3"]) def test_verify_info_x_velocity(env, version): """Asserts that the environment `info['x_velocity']` is properly assigned""" env = gym.make(f"{env}-{version}") @@ -49,11 +84,11 @@ def test_verify_info_x_velocity(env, version): dx = new_x - old_x vel_x = dx / env.dt - assert vel_x == info['x_velocity'] + assert vel_x == info["x_velocity"] # Note: "HumnanoidStandup-v4" does not have `info` -@pytest.mark.parametrize("env_id", ['Swimmer-v5', 'Swimmer-v4', 'Swimmer-v3']) +@pytest.mark.parametrize("env_id", ["Swimmer-v5", "Swimmer-v4", "Swimmer-v3"]) def test_verify_info_y_velocity(env_id): """Asserts that the environment `info['y_velocity']` is properly assigned""" env = gym.make(env_id) @@ -65,10 +100,10 @@ def test_verify_info_y_velocity(env_id): dy = new_y - old_y vel_y = dy / env.dt - assert vel_y == info['y_velocity'] + assert vel_y == info["y_velocity"] -@pytest.mark.parametrize("env_id", ['Ant-v5', 'Ant-v4', 'Ant-v3']) +@pytest.mark.parametrize("env_id", ["Ant-v5", "Ant-v4", "Ant-v3"]) def test_verify_info_xy_velocity_xpos(env_id): """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the ant environment which uses kinmatics for the velocity""" env = gym.make(env_id) @@ -80,11 +115,11 @@ def test_verify_info_xy_velocity_xpos(env_id): dxy = new_xy - old_xy vel_x, vel_y = dxy / env.dt - assert vel_x == info['x_velocity'] - assert vel_y == info['y_velocity'] + assert vel_x == info["x_velocity"] + assert vel_y == info["y_velocity"] -@pytest.mark.parametrize("env_id", ['Humanoid-v5', 'Humanoid-v4', 'Humanoid-v3']) +@pytest.mark.parametrize("env_id", ["Humanoid-v5", "Humanoid-v4", "Humanoid-v3"]) def test_verify_info_xy_velocity_com(env_id): """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the humanoid environment which uses kinmatics of Center Of Mass for the velocity""" @@ -102,15 +137,25 @@ def mass_center(model, data): dxy = new_xy - old_xy vel_x, vel_y = dxy / env.dt - assert vel_x == info['x_velocity'] - assert vel_y == info['y_velocity'] + assert vel_x == info["x_velocity"] + assert vel_y == info["y_velocity"] # Note: Hopper-v4/3/2 does not have `info['reward_survive']`, but it is still affected # Note: Walker2d-v4/3/2 does not have `info['reward_survive']`, but it is still affected # Note: Inverted(Double)Pendulum-v4/2 does not have `info['reward_survive']`, but it is still affected # Note: all `v4/v3/v2` environments with a heathly reward are fail this test -@pytest.mark.parametrize("env_id", ['Ant-v5', 'Hopper-v5', 'Humanoid-v5', 'InvertedDoublePendulum-v5', 'InvertedPendulum-v5', 'Walker2d-v5']) +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "Hopper-v5", + "Humanoid-v5", + "InvertedDoublePendulum-v5", + "InvertedPendulum-v5", + "Walker2d-v5", + ], +) def test_verify_reward_survive(env_id): """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states""" env = gym.make(env_id, reset_noise_scale=0) @@ -121,12 +166,14 @@ def test_verify_reward_survive(env_id): obs, rew, terminal, truncated, info = env.step(env.action_space.sample()) if terminal: - assert info['reward_survive'] == 0 + assert info["reward_survive"] == 0 break - assert info['reward_survive'] != 0 + assert info["reward_survive"] != 0 - assert terminal, "The environment, should have terminated, if not the test is not valid." + assert ( + terminal + ), "The environment, should have terminated, if not the test is not valid." CHECK_ENV_IGNORE_WARNINGS = [ @@ -140,7 +187,7 @@ def test_verify_reward_survive(env_id): @pytest.mark.parametrize("env", ALL_MUJOCO_ENVS) -@pytest.mark.parametrize("version", ['v5']) +@pytest.mark.parametrize("version", ["v5"]) @pytest.mark.parametrize("frame_skip", [1, 2, 3, 4, 5]) def test_frame_skip(env, version, frame_skip): """Verify that all `mujoco` envs work with different `frame_skip` values""" @@ -157,71 +204,108 @@ def test_frame_skip(env, version, frame_skip): # Dev Note: This can be version env parametrized because each env has it's own reward function -@pytest.mark.parametrize("version", ['v5']) +@pytest.mark.parametrize("version", ["v5"]) def test_reward_sum(version): """Assert that the total reward equals the sum of the individual reward terms.""" - env = gym.make(f'Ant-{version}') + env = gym.make(f"Ant-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_contact"] - info["reward_survive"] < 1e-14 - - env = gym.make(f'HalfCheetah-{version}') + assert ( + reward + - info["reward_forward"] + - info["reward_ctrl"] + - info["reward_contact"] + - info["reward_survive"] + < 1e-14 + ) + + env = gym.make(f"HalfCheetah-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) assert reward - info["reward_forward"] - info["reward_ctrl"] < 1e-14 - env = gym.make(f'Hopper-{version}') + env = gym.make(f"Hopper-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] < 1e-14 + assert ( + reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] + < 1e-14 + ) - env = gym.make(f'Humanoid-{version}') + env = gym.make(f"Humanoid-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_contact"] - info["reward_survive"] < 1e-14 - - env = gym.make(f'HumanoidStandup-{version}') + assert ( + reward + - info["reward_forward"] + - info["reward_ctrl"] + - info["reward_contact"] + - info["reward_survive"] + < 1e-14 + ) + + env = gym.make(f"HumanoidStandup-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_linup"] - info["reward_quadctrl"] - info["reward_impact"] - 1 < 1e-14 + assert ( + reward + - info["reward_linup"] + - info["reward_quadctrl"] + - info["reward_impact"] + - 1 + < 1e-14 + ) - env = gym.make(f'InvertedDoublePendulum-{version}') + env = gym.make(f"InvertedDoublePendulum-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_survive"] - info["distance_penalty"] - info["velocity_penalty"] < 1e-14 - - env = gym.make(f'InvertedPendulum-{version}') + assert ( + reward + - info["reward_survive"] + - info["distance_penalty"] + - info["velocity_penalty"] + < 1e-14 + ) + + env = gym.make(f"InvertedPendulum-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) assert reward == info["reward_survive"] - env = gym.make(f'Pusher-{version}') + env = gym.make(f"Pusher-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_dist"] - info["reward_ctrl"] - info["reward_near"] < 1e-14 + assert ( + reward - info["reward_dist"] - info["reward_ctrl"] - info["reward_near"] < 1e-14 + ) - env = gym.make(f'Reacher-{version}') + env = gym.make(f"Reacher-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) assert reward - info["reward_dist"] - info["reward_ctrl"] < 1e-14 - env = gym.make(f'Swimmer-{version}') + env = gym.make(f"Swimmer-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) assert reward - info["reward_forward"] - info["reward_ctrl"] < 1e-14 - env = gym.make(f'Walker2d-{version}') + env = gym.make(f"Walker2d-{version}") env.reset() _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] < 1e-14 + assert ( + reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] + < 1e-14 + ) # Note: the environtments that are not present, is because they do not have identical behaviour -@pytest.mark.parametrize("env", ['HalfCheetah', 'HumanoidStandup', 'Pusher', 'Reacher', 'Swimmer']) +@pytest.mark.parametrize( + "env", ["HalfCheetah", "HumanoidStandup", "Pusher", "Reacher", "Swimmer"] +) def test_identical_behaviour_v45(env): """Verify that v4 -> v5 transition. does not change the behaviour of the environments in way way""" - env_v4 = gym.make(env + '-v4') - env_v5 = gym.make(env + '-v5') + env_v4 = gym.make(env + "-v4") + env_v5 = gym.make(env + "-v5") env_v4.reset(seed=1234) env_v5.reset(seed=1234) action = env_v4.action_space.sample() @@ -231,16 +315,18 @@ def test_identical_behaviour_v45(env): assert obs_v5.shape[0] != 1 assert (env_v4.unwrapped.data.qpos == env_v5.unwrapped.data.qpos).all() assert (env_v4.unwrapped.data.qvel == env_v5.unwrapped.data.qvel).all() - if env not in ['HumanoidStandup', 'Reacher']: # they have different obs + if env not in ["HumanoidStandup", "Reacher"]: # they have different obs assert (obs_v4 == obs_v5).all() assert rew_v4 == rew_v5 assert terminal_v4 == terminal_v5 and truncated_v4 == truncated_v5 -@pytest.mark.parametrize("version", ['v5', 'v4']) +@pytest.mark.parametrize("version", ["v5", "v4"]) def test_ant_com(version): """Verify the kinmatic behaviour of the ant""" - env = gym.make(f'Ant-{version}') # `env` contains `data : MjData` and `model : MjModel` + env = gym.make( + f"Ant-{version}" + ) # `env` contains `data : MjData` and `model : MjModel` env.reset() # randomly initlizies the `data.qpos` and `data.qvel`, calls mujoco.mj_forward(env.model, env.data) x_position_before = env.unwrapped.data.qpos[0] @@ -248,7 +334,9 @@ def test_ant_com(version): assert x_position_before == x_position_before_com, "before failed" # This succeeds random_control = env.action_space.sample() - _, _, _, _, info = env.step(random_control) # This calls mujoco.mj_step(env.model, env.data, nstep=env.frame_skip) + _, _, _, _, info = env.step( + random_control + ) # This calls mujoco.mj_step(env.model, env.data, nstep=env.frame_skip) mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) x_position_after = env.unwrapped.data.qpos[0] @@ -256,13 +344,17 @@ def test_ant_com(version): assert x_position_after == x_position_after_com, "after failed" # This succeeds -@pytest.mark.parametrize("version", ['v5', 'v4', 'v3', 'v2']) +@pytest.mark.parametrize("version", ["v5", "v4", "v3", "v2"]) def test_set_state(version): """Simple Test to verify that `mujocoEnv.set_state()` works correctly""" - env = gym.make(f'Hopper-{version}') + env = gym.make(f"Hopper-{version}") env.reset() - new_qpos = np.array([0.00136962, 1.24769787, -0.00459026, -0.00483472, 0.0031327 , 0.00412756]) - new_qvel = np.array([0.00106636, 0.00229497, 0.00043625, 0.00435072, 0.00315854, -0.00497261]) + new_qpos = np.array( + [0.00136962, 1.24769787, -0.00459026, -0.00483472, 0.0031327, 0.00412756] + ) + new_qvel = np.array( + [0.00106636, 0.00229497, 0.00043625, 0.00435072, 0.00315854, -0.00497261] + ) env.set_state(new_qpos, new_qvel) assert (env.unwrapped.data.qpos == new_qpos).all() assert (env.unwrapped.data.qvel == new_qvel).all() @@ -272,16 +364,20 @@ def test_set_state(version): # Note: Ant-v4/v3 fails this test # Note: Humanoid-v4/v3 fails this test # Note: v2 does not have `info` -@pytest.mark.parametrize("env_id", ['Ant-v5', 'Humanoid-v5', 'Swimmer-v5', 'Swimmer-v4', 'Swimmer-v3']) +@pytest.mark.parametrize( + "env_id", ["Ant-v5", "Humanoid-v5", "Swimmer-v5", "Swimmer-v4", "Swimmer-v3"] +) def test_distance_from_origin_info(env_id): """Verify that `info"distance_from_origin"` is correct""" env = gym.make(env_id) env.reset() _, _, _, _, info = env.step(env.action_space.sample()) - assert info["distance_from_origin"] == np.linalg.norm(env.unwrapped.data.qpos[0:2] - env.init_qpos[0:2]) + assert info["distance_from_origin"] == np.linalg.norm( + env.unwrapped.data.qpos[0:2] - env.init_qpos[0:2] + ) -@pytest.mark.parametrize("env_id", ['Hopper-v5', 'HumanoidStandup-v5', 'Walker2d-v5']) +@pytest.mark.parametrize("env_id", ["Hopper-v5", "HumanoidStandup-v5", "Walker2d-v5"]) def test_z_distance_from_origin_info(env_id): """Verify that `info"z_distance_from_origin"` is correct""" env = gym.make(env_id) @@ -289,10 +385,25 @@ def test_z_distance_from_origin_info(env_id): _, _, _, _, info = env.step(env.action_space.sample()) mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) z_index = env.observation_structure["skipped_qpos"] - assert info["z_distance_from_origin"] == env.unwrapped.data.qpos[z_index] - env.init_qpos[z_index] - - -@pytest.mark.parametrize("env_id", ['Ant-v5', 'HalfCheetah-v5', 'Hopper-v5', 'Humanoid-v5', 'HumanoidStandup-v5', 'InvertedPendulum-v5', 'Swimmer-v5', 'Walker2d-v5']) + assert ( + info["z_distance_from_origin"] + == env.unwrapped.data.qpos[z_index] - env.init_qpos[z_index] + ) + + +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "HalfCheetah-v5", + "Hopper-v5", + "Humanoid-v5", + "HumanoidStandup-v5", + "InvertedPendulum-v5", + "Swimmer-v5", + "Walker2d-v5", + ], +) def test_observation_structure(env_id): """Verify that the `env.observation_structure` is properly defined.""" env = gym.make(env_id) @@ -301,7 +412,9 @@ def test_observation_structure(env_id): obs_struct = env.observation_structure - assert env.unwrapped.model.nq == obs_struct.get("skipped_qpos", 0) + obs_struct["qpos"] + assert ( + env.unwrapped.model.nq == obs_struct.get("skipped_qpos", 0) + obs_struct["qpos"] + ) assert env.unwrapped.model.nv == obs_struct["qvel"] if obs_struct.get("cinert", 0): assert (env.unwrapped.model.nbody - 1) * 10 == obs_struct["cinert"] @@ -317,12 +430,23 @@ def test_observation_structure(env_id): assert env.unwrapped.model.ntendon == obs_struct["ten_velocity"] -@pytest.mark.parametrize("env_id", ['Ant-v5', 'HalfCheetah-v5', 'Hopper-v5', 'Humanoid-v5', 'HumanoidStandup-v5', 'Swimmer-v5', 'Walker2d-v5']) +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "HalfCheetah-v5", + "Hopper-v5", + "Humanoid-v5", + "HumanoidStandup-v5", + "Swimmer-v5", + "Walker2d-v5", + ], +) def test_reset_info(env_id): - """Verify that the environmet returns info at `reset()`""" + """Verify that the environment returns info at `reset()`""" env = gym.make(env_id) _, reset_info = env.reset() - assert reset_info.get('x_position') + assert reset_info.get("x_position") """ @@ -337,8 +461,8 @@ def test_reset_info(env_id): """ -# Note: the max height used to be wrong in the documention. -@pytest.mark.parametrize("version", ['v5']) +# Note: the max height used to be wrong in the documentation. +@pytest.mark.parametrize("version", ["v5"]) def test_inverted_double_pendulum_max_height(version): """Verify the max height of Inverted Double Pendulum""" env = gym.make(f"InvertedDoublePendulum-{version}", reset_noise_scale=0) @@ -347,7 +471,7 @@ def test_inverted_double_pendulum_max_height(version): assert y == 1.2 -@pytest.mark.parametrize("version", ['v4']) +@pytest.mark.parametrize("version", ["v4"]) def test_inverted_double_pendulum_max_height_old(version): """Verify the max height of Inverted Double Pendulum (v4 does not have `reset_noise_scale` argument)""" env = gym.make(f"InvertedDoublePendulum-{version}") @@ -357,10 +481,10 @@ def test_inverted_double_pendulum_max_height_old(version): # note: fails with `brax==0.9.0` -@pytest.mark.parametrize("version", ['v5', 'v4']) +@pytest.mark.parametrize("version", ["v5", "v4"]) def test_model_object_count(version): - """Verify that all the objects of the model are loaded, mostly usefull for using non-mujoco simulator.""" - env = gym.make(f'Ant-{version}') + """Verify that all the objects of the model are loaded, mostly useful for using non-mujoco simulator.""" + env = gym.make(f"Ant-{version}") assert env.unwrapped.model.nq == 15 assert env.unwrapped.model.nv == 14 assert env.unwrapped.model.nu == 8 @@ -370,7 +494,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 14 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'HalfCheetah-{version}') + env = gym.make(f"HalfCheetah-{version}") assert env.unwrapped.model.nq == 9 assert env.unwrapped.model.nv == 9 assert env.unwrapped.model.nu == 6 @@ -380,7 +504,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 9 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'Hopper-{version}') + env = gym.make(f"Hopper-{version}") assert env.unwrapped.model.nq == 6 assert env.unwrapped.model.nv == 6 assert env.unwrapped.model.nu == 3 @@ -390,7 +514,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 5 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'Humanoid-{version}') + env = gym.make(f"Humanoid-{version}") assert env.unwrapped.model.nq == 24 assert env.unwrapped.model.nv == 23 assert env.unwrapped.model.nu == 17 @@ -400,7 +524,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 18 assert env.unwrapped.model.ntendon == 2 - env = gym.make(f'HumanoidStandup-{version}') + env = gym.make(f"HumanoidStandup-{version}") assert env.unwrapped.model.nq == 24 assert env.unwrapped.model.nv == 23 assert env.unwrapped.model.nu == 17 @@ -410,7 +534,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 18 assert env.unwrapped.model.ntendon == 2 - env = gym.make(f'InvertedDoublePendulum-{version}') + env = gym.make(f"InvertedDoublePendulum-{version}") assert env.unwrapped.model.nq == 3 assert env.unwrapped.model.nv == 3 assert env.unwrapped.model.nu == 1 @@ -420,7 +544,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 5 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'InvertedPendulum-{version}') + env = gym.make(f"InvertedPendulum-{version}") assert env.unwrapped.model.nq == 2 assert env.unwrapped.model.nv == 2 assert env.unwrapped.model.nu == 1 @@ -430,7 +554,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 3 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'Pusher-{version}') + env = gym.make(f"Pusher-{version}") assert env.unwrapped.model.nq == 11 assert env.unwrapped.model.nv == 11 assert env.unwrapped.model.nu == 7 @@ -440,7 +564,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 21 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'Reacher-{version}') + env = gym.make(f"Reacher-{version}") assert env.unwrapped.model.nq == 4 assert env.unwrapped.model.nv == 4 assert env.unwrapped.model.nu == 2 @@ -450,7 +574,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 10 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'Swimmer-{version}') + env = gym.make(f"Swimmer-{version}") assert env.unwrapped.model.nq == 5 assert env.unwrapped.model.nv == 5 assert env.unwrapped.model.nu == 2 @@ -460,7 +584,7 @@ def test_model_object_count(version): assert env.unwrapped.model.ngeom == 4 assert env.unwrapped.model.ntendon == 0 - env = gym.make(f'Walker2d-{version}') + env = gym.make(f"Walker2d-{version}") assert env.unwrapped.model.nq == 9 assert env.unwrapped.model.nv == 9 assert env.unwrapped.model.nu == 6 From 52fbae69a271675035544c3fa92eb05fc8729bc6 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 27 Jun 2023 23:17:56 +0300 Subject: [PATCH 11/53] remove old v4 doc --- gymnasium/envs/mujoco/ant_v4.py | 187 -------------- gymnasium/envs/mujoco/half_cheetah_v4.py | 121 --------- gymnasium/envs/mujoco/hopper_v4.py | 127 --------- gymnasium/envs/mujoco/humanoid_v4.py | 243 ------------------ gymnasium/envs/mujoco/humanoidstandup_v4.py | 230 ----------------- .../mujoco/inverted_double_pendulum_v4.py | 111 -------- gymnasium/envs/mujoco/inverted_pendulum_v4.py | 81 ------ gymnasium/envs/mujoco/pusher_v4.py | 127 --------- gymnasium/envs/mujoco/reacher_v4.py | 111 -------- gymnasium/envs/mujoco/swimmer_v4.py | 117 --------- gymnasium/envs/mujoco/walker2d_v4.py | 132 ---------- 11 files changed, 1587 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v4.py b/gymnasium/envs/mujoco/ant_v4.py index 4bec72528..310c444ce 100644 --- a/gymnasium/envs/mujoco/ant_v4.py +++ b/gymnasium/envs/mujoco/ant_v4.py @@ -11,193 +11,6 @@ class AntEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the environment introduced by Schulman, - Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control - Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438). - The ant is a 3D robot consisting of one torso (free rotational body) with - four legs attached to it with each leg having two body parts. The goal is to - coordinate the four legs to move in the forward (right) direction by applying - torques on the eight hinges connecting the two body parts of each leg and the torso - (nine body parts and eight hinges). - - ## Action Space - The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | - | 0 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | - | 1 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | - | 2 | Torque applied on the rotor between the torso and front left hip | -1 | 1 | hip_1 (front_left_leg) | hinge | torque (N m) | - | 3 | Torque applied on the rotor between the front left two links | -1 | 1 | angle_1 (front_left_leg) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between the torso and front right hip | -1 | 1 | hip_2 (front_right_leg) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between the front right two links | -1 | 1 | angle_2 (front_right_leg) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the torso and back left hip | -1 | 1 | hip_3 (back_leg) | hinge | torque (N m) | - | 7 | Torque applied on the rotor between the back left two links | -1 | 1 | angle_3 (back_leg) | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the ant, - followed by the velocities of those individual parts (their derivatives) with all - the positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the ant's torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (29,), float64)` where the first two observations - represent the x- and y- coordinates of the ant's torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, observation Space is a `Box(-Inf, Inf, (27,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 8 | angle between the two links on the front right | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 9 | angle between torso and first link on back left | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | - | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | - | 21 | angular velocity of angle between torso and front right link | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | - | 22 | angular velocity of the angle between front right links | -Inf | Inf | ankle_2 (front_right_leg) | hinge | angle (rad) | - | 23 | angular velocity of angle between torso and back left link | -Inf | Inf | hip_3 (back_leg) | hinge | angle (rad) | - | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | - | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | - | 26 | angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - - - If version < `v4` or `use_contact_forces` is `True` then the observation space is extended by 14*6 = 84 elements, which are contact forces - (external forces - force x, y, z and torque x, y, z) applied to the - center of mass of each of the body parts. The 14 body parts are: - - | id (for `v2`, `v3`, `v4)` | body parts | - | --- | ------------ | - | 0 | worldbody (note: forces are always full of zeros) | - | 1 | torso | - | 2 | front_left_leg | - | 3 | aux_1 (front left leg) | - | 4 | ankle_1 (front left leg) | - | 5 | front_right_leg | - | 6 | aux_2 (front right leg) | - | 7 | ankle_2 (front right leg) | - | 8 | back_leg (back left leg) | - | 9 | aux_3 (back left leg) | - | 10 | ankle_3 (back left leg) | - | 11 | right_back_leg | - | 12 | aux_4 (back right leg) | - | 13 | ankle_4 (back right leg) | - - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - - **Note:** Ant-v4 environment no longer has the following contact forces issue. - If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results - in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 - when using the Ant environment if you would like to report results with contact forces (if - contact forces are not used in your experiments, you can use version > 2.0). - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward` - - *forward_reward*: A reward of moving forward which is measured as - *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time - between actions and is dependent on the `frame_skip` parameter (default is 5), - where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*. - This reward would be positive if the ant moves forward (in positive x direction). - - *ctrl_cost*: A negative reward for penalising the ant if it takes actions - that are too large. It is measured as *`ctrl_cost_weight` * sum(action2)* - where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5. - - *contact_cost*: A negative reward for penalising the ant if the external contact - force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact - force to `contact_force_range`)2)*. - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. - - But if `use_contact_forces=True` or version < `v4` - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. - - In either case `info` will also contain the individual reward terms. - - ## Starting State - All observations start in state - (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise - with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for - stochasticity. Note that the initial z coordinate is intentionally selected - to be slightly high, thereby indicating a standing up ant. The initial orientation - is designed to make it face forward as well. - - ## Episode End - The ant is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0]) - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The ant is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Ant-v2') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Ant-v4', ctrl_cost_weight=0.1, ...) - ``` - - | Parameter | Type | Default |Description | - |-------------------------|------------|--------------|-------------------------------| - | `xml_file` | **str** | `"ant.xml"` | Path to a MuJoCo model | - | `ctrl_cost_weight` | **float** | `0.5` | Weight for *ctrl_cost* term (see section on reward) | - | `use_contact_forces` | **bool** | `False` | If true, it extends the observation space by adding contact forces (see `Observation Space` section) and includes contact_cost to the reward function (see `Rewards` section) | - | `contact_cost_weight` | **float** | `5e-4` | Weight for *contact_cost* term (see section on reward) | - | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool**| `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | - | `healthy_z_range` | **tuple** | `(0.2, 1)` | The ant is considered healthy if the z-coordinate of the torso is in this range | - | `contact_force_range` | **tuple** | `(-1, 1)` | Contact forces are clipped to this range in the computation of *contact_cost* | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3, also removed contact forces from the default observation space (new variable `use_contact_forces=True` can restore them) - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/half_cheetah_v4.py b/gymnasium/envs/mujoco/half_cheetah_v4.py index f90d59065..bc835c0ee 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v4.py +++ b/gymnasium/envs/mujoco/half_cheetah_v4.py @@ -13,127 +13,6 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the work by P. Wawrzyński in - ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf). - The HalfCheetah is a 2-dimensional robot consisting of 9 body parts and 8 - joints connecting them (including two paws). The goal is to apply a torque - on the joints to make the cheetah run forward (right) as fast as possible, - with a positive reward allocated based on the distance moved forward and a - negative reward allocated for moving backward. The torso and head of the - cheetah are fixed, and the torque can only be applied on the other 6 joints - over the front and back thighs (connecting to the torso), shins - (connecting to the thighs) and feet (connecting to the shins). - - ## Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | - | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) | - | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) | - | 2 | Torque applied on the back foot rotor | -1 | 1 | bfoot | hinge | torque (N m) | - | 3 | Torque applied on the front thigh rotor | -1 | 1 | fthigh | hinge | torque (N m) | - | 4 | Torque applied on the front shin rotor | -1 | 1 | fshin | hinge | torque (N m) | - | 5 | Torque applied on the front foot rotor | -1 | 1 | ffoot | hinge | torque (N m) | - - - ## Observation Space - Observations consist of positional values of different body parts of the - cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the cheetah's `rootx`. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (18,), float64)` where the first element - represents the `rootx`. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the - will be returned in `info` with key `"x_position"`. - - However, by default, the observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angle (rad) | - | 3 | angle of the second rotor | -Inf | Inf | bshin | hinge | angle (rad) | - | 4 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angle (rad) | - | 5 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angle (rad) | - | 6 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angle (rad) | - | 7 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angle (rad) | - | 8 | x-coordinate of the front tip | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | y-coordinate of the front tip | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angle of the front tip | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angle of the second rotor | -Inf | Inf | bthigh | hinge | angular velocity (rad/s) | - | 12 | angle of the second rotor | -Inf | Inf | bshin | hinge | angular velocity (rad/s) | - | 13 | velocity of the tip along the x-axis | -Inf | Inf | bfoot | hinge | angular velocity (rad/s) | - | 14 | velocity of the tip along the y-axis | -Inf | Inf | fthigh | hinge | angular velocity (rad/s) | - | 15 | angular velocity of front tip | -Inf | Inf | fshin | hinge | angular velocity (rad/s) | - | 16 | angular velocity of second rotor | -Inf | Inf | ffoot | hinge | angular velocity (rad/s) | - | excluded | x-coordinate of the front tip | -Inf | Inf | rootx | slide | position (m) | - - ## Rewards - The reward consists of two parts: - - *forward_reward*: A reward of moving forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (fixed to 5), where the frametime is 0.01 - making the - default *dt = 5 * 0.01 = 0.05*. This reward would be positive if the cheetah - runs forward (right). - - *ctrl_cost*: A cost for penalising the cheetah if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 0.1 - - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the - initial state for stochasticity. As seen before, the first 8 values in the - state are positional and the last 9 values are velocity. A uniform noise in - the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard - normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the - initial velocity values of all zeros. - - ## Episode End - The episode truncates when the episode length is greater than 1000. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('HalfCheetah-v2') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('HalfCheetah-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"half_cheetah.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ weight (see section on reward) | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/hopper_v4.py b/gymnasium/envs/mujoco/hopper_v4.py index ef6cc0741..87c0b86c9 100644 --- a/gymnasium/envs/mujoco/hopper_v4.py +++ b/gymnasium/envs/mujoco/hopper_v4.py @@ -14,133 +14,6 @@ class HopperEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the work done by Erez, Tassa, and Todorov in - ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to - increase the number of independent state and control variables as compared to - the classic control environments. The hopper is a two-dimensional - one-legged figure that consist of four main body parts - the torso at the - top, the thigh in the middle, the leg in the bottom, and a single foot on - which the entire body rests. The goal is to make hops that move in the - forward (right) direction by applying torques on the three hinges - connecting the four body parts. - - ## Action Space - The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the - hopper, followed by the velocities of those individual parts - (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the hopper. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be `Box(-Inf, Inf, (12,), float64)` where the first observation - represents the x-coordinate of the hopper. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - will be returned in `info` with key `"x_position"`. - - However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements - correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | - | 6 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | - | 7 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 8 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 9 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 10 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | - - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`. - - *forward_reward*: A reward of hopping forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (fixed to 4), where the frametime is 0.002 - making the - default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper - hops forward (positive x direction). - - *ctrl_cost*: A cost for penalising the hopper if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise - in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ## Episode End - The hopper is said to be unhealthy if any of the following happens: - - 1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range` - 2. The height of the hopper (`observation[0]` if `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen) - 3. The angle (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The hopper is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Hopper-v2') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Hopper-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.001` | Weight for _ctrl_cost_ reward (see section on reward) | - | `healthy_reward` | **float** | `1` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the hopper is no longer healthy | - | `healthy_state_range` | **tuple** | `(-100, 100)` | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy | - | `healthy_z_range` | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-0.2, 0.2)` | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy | - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/humanoid_v4.py b/gymnasium/envs/mujoco/humanoid_v4.py index 5b3d9df1d..c5a1bc9b7 100644 --- a/gymnasium/envs/mujoco/humanoid_v4.py +++ b/gymnasium/envs/mujoco/humanoid_v4.py @@ -20,249 +20,6 @@ def mass_center(model, data): class HumanoidEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the environment introduced by Tassa, Erez and Todorov - in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). - The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of - legs and arms. The legs each consist of three body parts, and the arms 2 body parts (representing the knees and - elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over. - - ## Action Space - The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| - | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | - | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | - | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | - | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | - | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | - | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | - | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations - represent the x- and y-coordinates of the torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | - | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | - | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | - | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | - | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | - | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | - | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | - | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | - | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | - | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | - | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | - | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | - | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | - | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | - | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | - | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | - | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | - | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - - Additionally, after all the positional and velocity based values in the table, - the observation contains (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) - and hence adds to another 140 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence - adds another 84 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the - number of degrees of freedom (*= dim(qvel)*) - - The body parts are: - - | id (for `v2`,`v3`,`v4`) | body part | - | --- | ------------ | - | 0 | worldBody (note: all values are constant 0) | - | 1 | torso | - | 2 | lwaist | - | 3 | pelvis | - | 4 | right_thigh | - | 5 | right_sin | - | 6 | right_foot | - | 7 | left_thigh | - | 8 | left_sin | - | 9 | left_foot | - | 10 | right_upper_arm | - | 11 | right_lower_arm | - | 12 | left_upper_arm | - | 13 | left_lower_arm | - - The joints are: - - | id (for `v2`,`v3`,`v4`) | joint | - | --- | ------------ | - | 0 | root | - | 1 | root | - | 2 | root | - | 3 | root | - | 4 | root | - | 5 | root | - | 6 | abdomen_z | - | 7 | abdomen_y | - | 8 | abdomen_x | - | 9 | right_hip_x | - | 10 | right_hip_z | - | 11 | right_hip_y | - | 12 | right_knee | - | 13 | left_hip_x | - | 14 | left_hiz_z | - | 15 | left_hip_y | - | 16 | left_knee | - | 17 | right_shoulder1 | - | 18 | right_shoulder2 | - | 19 | right_elbow| - | 20 | left_shoulder1 | - | 21 | left_shoulder2 | - | 22 | left_elfbow | - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the - [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - **Note:** Humanoid-v4 environment no longer has the following contact forces issue. - If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 - results in the contact forces always being 0. As such we recommend to use a Mujoco-Py - version < 2.0 when using the Humanoid environment if you would like to report results - with contact forces (if contact forces are not used in your experiments, you can use - version > 2.0). - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward` - - *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` * - (average center of mass before action - average center of mass after action)/dt*. - *dt* is the time between actions and is dependent on the frame_skip parameter - (default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*. - This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation - for the center of mass is defined in the `.py` file for the Humanoid. - - *ctrl_cost*: A negative reward for penalising the humanoid if it has too - large of a control force. If there are *nu* actuators/controls, then the control has - shape `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control2)*. - - *contact_cost*: A negative reward for penalising the humanoid if the external - contact force is too large. It is calculated by clipping - *`contact_cost_weight` * sum(external contact force2)* to the interval specified by `contact_cost_range`. - - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally - selected to be high, thereby indicating a standing up humanoid. The initial - orientation is designed to make it face forward as well. - - ## Episode End - The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the - closed interval specified by the argument `healthy_z_range`. - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 3. Termination: The humanoid is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Humanoid-v4') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Humanoid-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"humanoid.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.25` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ term (see section on reward) | - | `contact_cost_weight` | **float** | `5e-7` | Weight for _contact_cost_ term (see section on reward) | - | `healthy_reward` | **float** | `5.0` | Constant reward given if the humanoid is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | - | `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range | - | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/humanoidstandup_v4.py b/gymnasium/envs/mujoco/humanoidstandup_v4.py index 40e791c9b..fe3cc9f06 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v4.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v4.py @@ -14,236 +14,6 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is based on the environment introduced by Tassa, Erez and Todorov - in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025). - The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a - pair of legs and arms. The legs each consist of two links, and so the arms (representing the - knees and elbows respectively). The environment starts with the humanoid laying on the ground, - and then the goal of the environment is to make the humanoid standup and then keep it standing - by applying torques on the various hinges. - - ## Action Space - The agent take a 17-element vector for actions. - - The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action` - represents the numerical torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | - | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | - | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | - | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | - | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | - | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | - | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | - | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations - represent the x- and y-coordinates of the torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | - | 5 | z-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angle (rad) | - | 6 | y-angle of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angle (rad) | - | 7 | x-angle of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angle (rad) | - | 8 | x-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angle (rad) | - | 9 | z-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angle (rad) | - | 10 | y-coordinate of angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angle (rad) | - | 11 | angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angle (rad) | - | 12 | x-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angle (rad) | - | 13 | z-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angle (rad) | - | 14 | y-coordinate of angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angle (rad) | - | 15 | angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angle (rad) | - | 16 | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angle (rad) | - | 17 | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angle (rad) | - | 18 | angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angle (rad) | - | 19 | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angle (rad) | - | 20 | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angle (rad) | - | 21 | angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angle (rad) | - | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - - Additionally, after all the positional and velocity based values in the table, - the observation contains (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*) - and hence adds to another 140 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence - adds another 84 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(23,)` *(nv * 1)* and hence adds another 23 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the - number of degrees of freedom (*= dim(qvel)*) - - The body parts are: - - | id (for `v2`,`v3`,`v4`) | body part | - | --- | ------------ | - | 0 | worldBody (note: all values are constant 0) | - | 1 | torso | - | 2 | lwaist | - | 3 | pelvis | - | 4 | right_thigh | - | 5 | right_sin | - | 6 | right_foot | - | 7 | left_thigh | - | 8 | left_sin | - | 9 | left_foot | - | 10 | right_upper_arm | - | 11 | right_lower_arm | - | 12 | left_upper_arm | - | 13 | left_lower_arm | - - The joints are: - - | id (for `v2`,`v3`,`v4`) | joint | - | --- | ------------ | - | 0 | root | - | 1 | root | - | 2 | root | - | 3 | root | - | 4 | root | - | 5 | root | - | 6 | abdomen_z | - | 7 | abdomen_y | - | 8 | abdomen_x | - | 9 | right_hip_x | - | 10 | right_hip_z | - | 11 | right_hip_y | - | 12 | right_knee | - | 13 | left_hip_x | - | 14 | left_hiz_z | - | 15 | left_hip_y | - | 16 | left_knee | - | 17 | right_shoulder1 | - | 18 | right_shoulder2 | - | 19 | right_elbow| - | 20 | left_shoulder1 | - | 21 | left_shoulder2 | - | 22 | left_elfbow | - - The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the - [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - - **Note:** HumanoidStandup-v4 environment no longer has the following contact forces issue. - If using previous HumanoidStandup versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results - in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 - when using the Humanoid environment if you would like to report results with contact forces - (if contact forces are not used in your experiments, you can use version > 2.0). - - ## Rewards - The reward consists of three parts: - - *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative - reward which measures how much upward it has moved from the last timestep, but it is an - absolute reward which measures how much upward the Humanoid has moved overall. It is - measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after - action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for - one frame of movement even though the simulation has a framerate of 5 (done in order to inflate - rewards a little for faster learning). - - *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of - a control force. If there are *nu* actuators/controls, then the control has shape `nu x 1`. - It is measured as *0.1 **x** sum(control2)*. - - *quad_impact_cost*: A negative reward for penalising the humanoid if the external - contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external - contact force2), 10)*. - - The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost* - - ## Starting State - All observations start in state - (0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of - [-0.01, 0.01] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally selected - to be low, thereby indicating a laying down humanoid. The initial orientation is - designed to make it face forward as well. - - ## Episode End - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: Any of the state space values is no longer finite - - ## Arguments - - No additional arguments are currently supported. - - ```python - import gymnasium as gym - env = gym.make('HumanoidStandup-v4') - ``` - - There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and - beyond take gymnasium.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc. - - ```python - import gymnasium as gym - env = gym.make('HumanoidStandup-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py index 7215683fa..2a9f9345d 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py @@ -13,117 +13,6 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment originates from control theory and builds on the cartpole - environment based on the work done by Barto, Sutton, and Anderson in - ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), - powered by the Mujoco physics simulator - allowing for more complex experiments - (such as varying the effects of gravity or constraints). This environment involves a cart that can - moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one - (leaving the second pole as the only one with one free end). The cart can be pushed left or right, - and the goal is to balance the second pole on top of the first pole, which is in turn on top of the - cart, by applying continuous forces on the cart. - - ## Action Space - The agent take a 1-element vector for actions. - The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the - numerical force applied to the cart (with magnitude representing the amount of force and - sign representing the direction) - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| - | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) | - - ## Observation Space - - The state space consists of positional values of different body parts of the pendulum system, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ----------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | - | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless | - | 2 | sine of the angle between the two poles | -Inf | Inf | sin(hinge2) | hinge | unitless | - | 3 | cosine of the angle between the cart and the first pole | -Inf | Inf | cos(hinge) | hinge | unitless | - | 4 | cosine of the angle between the two poles | -Inf | Inf | cos(hinge2) | hinge | unitless | - | 5 | velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | - | 6 | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | - | 7 | angular velocity of the angle between the two poles | -Inf | Inf | hinge2 | hinge | angular velocity (rad/s) | - | 8 | constraint force - 1 | -Inf | Inf | | | Force (N) | - | 9 | constraint force - 2 | -Inf | Inf | | | Force (N) | - | 10 | constraint force - 3 | -Inf | Inf | | | Force (N) | - - - There is physical contact between the robots and their environment - and Mujoco - attempts at getting realistic physics simulations for the possible physical contact - dynamics by aiming for physical accuracy and computational efficiency. - - There is one constraint force for contacts for each degree of freedom (3). - The approach and handling of constraints by Mujoco is unique to the simulator - and is based on their research. Once can find more information in their - [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html) - or in their paper - ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf). - - - ## Rewards - - The reward consists of two parts: - - *alive_bonus*: The goal is to make the second inverted pendulum stand upright - (within a certain angle limit) as long as possible - as such a reward of +10 is awarded - for each timestep that the second pole is upright. - - *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum - (the only free end) moves, and it is calculated as - *0.01 * x2 + (y - 2)2*, where *x* is the x-coordinate of the tip - and *y* is the y-coordinate of the tip of the second pole. - - *velocity_penalty*: A negative reward for penalising the agent if it moves too - fast *0.001 * v12 + 0.005 * v2 2* - - The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty* - - ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range - of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard - normal force with a standard deviation of 0.1 added to the velocity values for stochasticity. - - ## Episode End - The episode ends when any of the following happens: - - 1.Truncation: The episode duration reaches 1000 timesteps. - 2.Termination: Any of the state space values is no longer finite. - 3.Termination: The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other). - - ## Arguments - - No additional arguments are currently supported. - - ```python - import gymnasium as gym - env = gym.make('InvertedDoublePendulum-v4') - ``` - There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and - beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('InvertedDoublePendulum-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v4.py b/gymnasium/envs/mujoco/inverted_pendulum_v4.py index e935398be..72f6a9c88 100644 --- a/gymnasium/envs/mujoco/inverted_pendulum_v4.py +++ b/gymnasium/envs/mujoco/inverted_pendulum_v4.py @@ -12,87 +12,6 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment is the cartpole environment based on the work done by - Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can - solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077), - just like in the classic environments but now powered by the Mujoco physics simulator - - allowing for more complex experiments (such as varying the effects of gravity). - This environment involves a cart that can moved linearly, with a pole fixed on it - at one end and having another end free. The cart can be pushed left or right, and the - goal is to balance the pole on the top of the cart by applying forces on the cart. - - ## Action Space - The agent take a 1-element vector for actions. - - The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents - the numerical force applied to the cart (with magnitude representing the amount of - force and sign representing the direction) - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| - | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) | - - ## Observation Space - - The state space consists of positional values of different body parts of - the pendulum system, followed by the velocities of those individual parts (their derivatives) - with all the positions ordered before all the velocities. - - The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | --------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------- | - | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | - | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) | - | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | - | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | anglular velocity (rad/s) | - - - ## Rewards - - The goal is to make the inverted pendulum stand upright (within a certain angle limit) - as long as possible - as such a reward of +1 is awarded for each timestep that - the pole is upright. - - ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range - of [-0.01, 0.01] added to the values for stochasticity. - - ## Episode End - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches 1000 timesteps. - 2. Termination: Any of the state space values is no longer finite. - 3. Termination: The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian. - - ## Arguments - - No additional arguments are currently supported. - - ```python - import gymnasium as gym - env = gym.make('InvertedPendulum-v4') - ``` - There is no v3 for InvertedPendulum, unlike the robot environments where a - v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - ```python - import gymnasium as gym - env = gym.make('InvertedPendulum-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum) - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/pusher_v4.py b/gymnasium/envs/mujoco/pusher_v4.py index 55ba517e0..651c6325c 100644 --- a/gymnasium/envs/mujoco/pusher_v4.py +++ b/gymnasium/envs/mujoco/pusher_v4.py @@ -12,133 +12,6 @@ class PusherEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - "Pusher" is a multi-jointed robot arm which is very similar to that of a human. - The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*). - The robot consists of shoulder, elbow, forearm, and wrist joints. - - ## Action Space - The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | - | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | - | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | - | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | - | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | - | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | - | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | - - ## Observation Space - - Observations consist of - - - Angle of rotational joints on the pusher - - Angular velocities of rotational joints on the pusher - - The coordinates of the fingertip of the pusher - - The coordinates of the object to be moved - - The coordinates of the goal position - - The observation is a `Box(-Inf, Inf, (23,), float64)` where the elements correspond to the table below. - An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the - same as human joints. - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------------- | ---- | --- | -------------------------------- | -------- | ------------------------ | - | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) | - | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) | - | 2 | Rotation of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angle (rad) | - | 3 | Rotation of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angle (rad) | - | 4 | Rotation of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angle (rad) | - | 5 | Rotation of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angle (rad) | - | 6 | Rotation of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angle (rad) | - | 7 | Rotational velocity of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angular velocity (rad/s) | - | 8 | Rotational velocity of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angular velocity (rad/s) | - | 9 | Rotational velocity of the shoulder rolling joint | -Inf | Inf | r_upper_arm_roll_joint | hinge | angular velocity (rad/s) | - | 10 | Rotational velocity of hinge joint that flexed the elbow | -Inf | Inf | r_elbow_flex_joint | hinge | angular velocity (rad/s) | - | 11 | Rotational velocity of hinge that rolls the forearm | -Inf | Inf | r_forearm_roll_joint | hinge | angular velocity (rad/s) | - | 12 | Rotational velocity of flexing the wrist | -Inf | Inf | r_wrist_flex_joint | hinge | angular velocity (rad/s) | - | 13 | Rotational velocity of rolling the wrist | -Inf | Inf | r_wrist_roll_joint | hinge | angular velocity (rad/s) | - | 14 | x-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 15 | y-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 16 | z-coordinate of the fingertip of the pusher | -Inf | Inf | tips_arm | slide | position (m) | - | 17 | x-coordinate of the object to be moved | -Inf | Inf | object (obj_slidex) | slide | position (m) | - | 18 | y-coordinate of the object to be moved | -Inf | Inf | object (obj_slidey) | slide | position (m) | - | 19 | z-coordinate of the object to be moved | -Inf | Inf | object | cylinder | position (m) | - | 20 | x-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidex) | slide | position (m) | - | 21 | y-coordinate of the goal position of the object | -Inf | Inf | goal (goal_slidey) | slide | position (m) | - | 22 | z-coordinate of the goal position of the object | -Inf | Inf | goal | sphere | position (m) | - - - ## Rewards - The reward consists of two parts: - - *reward_near *: This reward is a measure of how far the *fingertip* - of the pusher (the unattached end) is from the object, with a more negative - value assigned for when the pusher's *fingertip* is further away from the - target. It is calculated as the negative vector norm of (position of - the fingertip - position of target), or *-norm("fingertip" - "target")*. - - *reward_dist *: This reward is a measure of how far the object is from - the target goal position, with a more negative value assigned for object is - further away from the target. It is calculated as the negative vector norm of - (position of the object - position of goal), or *-norm("object" - "target")*. - - *reward_control*: A negative reward for penalising the pusher if - it takes actions that are too large. It is measured as the negative squared - Euclidean norm of the action, i.e. as *- sum(action2)*. - - The total reward returned is ***reward*** *=* *reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near* - - Unlike other environments, Pusher does not allow you to specify weights for the individual reward terms. - However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, - you should create a wrapper that computes the weighted reward from `info`. - - - ## Starting State - All pusher (not including object and goal) states start in - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range - [-0.005, 0.005] is added to the velocity attributes only. The velocities of - the object and goal are permanently set to 0. The object's x-position is selected uniformly - between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this - process is repeated until the vector norm between the object's (x,y) position and origin is not greater - than 0.17. The goal always have the same position of (0.45, -0.05, -0.323). - - The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05* - - ## Episode End - - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 100 timesteps. - 2. Termination: Any of the state space values is no longer finite. - - ## Arguments - - No additional arguments are currently supported (in v2 and lower), - but modifications can be made to the XML file in the assets folder - (or by changing the path to a modified XML file in another folder).. - - ```python - import gymnasium as gym - env = gym.make('Pusher-v4') - ``` - - There is no v3 for Pusher, unlike the robot environments where a v3 and - beyond take `gymnasmium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Pusher-v2') - ``` - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/reacher_v4.py b/gymnasium/envs/mujoco/reacher_v4.py index 49c6d4892..9bc233bc3 100644 --- a/gymnasium/envs/mujoco/reacher_v4.py +++ b/gymnasium/envs/mujoco/reacher_v4.py @@ -9,117 +9,6 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a - target that is spawned at a random position. - - ## Action Space - The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------| - | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | - | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | - - ## Observation Space - Observations consist of - - - The cosine of the angles of the two arms - - The sine of the angles of the two arms - - The coordinates of the target - - The angular velocities of the arms - - The vector between the target and the reacher's fingertip (3 dimensional with the last element being 0) - - The observation is a `Box(-Inf, Inf, (11,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ---------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | - | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | - | 2 | sine of the angle of the first arm | -Inf | Inf | sin(joint0) | hinge | unitless | - | 3 | sine of the angle of the second arm | -Inf | Inf | sin(joint1) | hinge | unitless | - | 4 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | - | 5 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | - | 6 | angular velocity of the first arm | -Inf | Inf | joint0 | hinge | angular velocity (rad/s) | - | 7 | angular velocity of the second arm | -Inf | Inf | joint1 | hinge | angular velocity (rad/s) | - | 8 | x-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | - | 9 | y-value of position_fingertip - position_target | -Inf | Inf | NA | slide | position (m) | - | 10 | z-value of position_fingertip - position_target (constantly 0 since reacher is 2d and z is same for both) | -Inf | Inf | NA | slide | position (m) | - - - Most Gym environments just return the positions and velocity of the - joints in the `.xml` file as the state of the environment. However, in - reacher the state is created by combining only certain elements of the - position and velocity, and performing some function transformations on them. - If one is to read the `.xml` for reacher then they will find 4 joints: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - |-----|-----------------------------|----------|----------|----------------------------------|-------|--------------------| - | 0 | angle of the first arm | -Inf | Inf | joint0 | hinge | angle (rad) | - | 1 | angle of the second arm | -Inf | Inf | joint1 | hinge | angle (rad) | - | 2 | x-coordinate of the target | -Inf | Inf | target_x | slide | position (m) | - | 3 | y-coordinate of the target | -Inf | Inf | target_y | slide | position (m) | - - - ## Rewards - The reward consists of two parts: - - *reward_distance*: This reward is a measure of how far the *fingertip* - of the reacher (the unattached end) is from the target, with a more negative - value assigned for when the reacher's *fingertip* is further away from the - target. It is calculated as the negative vector norm of (position of - the fingertip - position of target), or *-norm("fingertip" - "target")*. - - *reward_control*: A negative reward for penalising the walker if - it takes actions that are too large. It is measured as the negative squared - Euclidean norm of the action, i.e. as *- sum(action2)*. - - The total reward returned is ***reward*** *=* *reward_distance + reward_control* - - Unlike other environments, Reacher does not allow you to specify weights for the individual reward terms. - However, `info` does contain the keys *reward_dist* and *reward_ctrl*. Thus, if you'd like to weight the terms, - you should create a wrapper that computes the weighted reward from `info`. - - - ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a noise added for stochasticity. A uniform noise in the range - [-0.1, 0.1] is added to the positional attributes, while the target position - is selected uniformly at random in a disk of radius 0.2 around the origin. - Independent, uniform noise in the - range of [-0.005, 0.005] is added to the velocities, and the last - element ("fingertip" - "target") is calculated at the end once everything - is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02* - - ## Episode End - - The episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 50 timesteps (with a new random target popping up if the reacher's fingertip reaches it before 50 timesteps) - 2. Termination: Any of the state space values is no longer finite. - - ## Arguments - - No additional arguments are currently supported (in v2 and lower), - but modifications can be made to the XML file in the assets folder - (or by changing the path to a modified XML file in another folder).. - - ```python - import gymnasium as gym - env = gym.make('Reacher-v4') - ``` - - There is no v3 for Reacher, unlike the robot environments where a v3 and - beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks (not including reacher, which has a max_time_steps of 50). Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/swimmer_v4.py b/gymnasium/envs/mujoco/swimmer_v4.py index 6db5f4f08..3f597b86e 100644 --- a/gymnasium/envs/mujoco/swimmer_v4.py +++ b/gymnasium/envs/mujoco/swimmer_v4.py @@ -8,123 +8,6 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment corresponds to the Swimmer environment described in Rémi Coulom's PhD thesis - ["Reinforcement Learning Using Neural Networks, with Applications to Motor Control"](https://tel.archives-ouvertes.fr/tel-00003985/document). - The environment aims to increase the number of independent state and control - variables as compared to the classic control environments. The swimmers - consist of three or more segments ('***links***') and one less articulation - joints ('***rotors***') - one rotor joint connecting exactly two links to - form a linear chain. The swimmer is suspended in a two dimensional pool and - always starts in the same position (subject to some deviation drawn from an - uniform distribution), and the goal is to move as fast as possible towards - the right by applying torque on the rotors and using the fluids friction. - - ## Notes - - The problem parameters are: - Problem parameters: - * *n*: number of body parts - * *mi*: mass of part *i* (*i* ∈ {1...n}) - * *li*: length of part *i* (*i* ∈ {1...n}) - * *k*: viscous-friction coefficient - - While the default environment has *n* = 3, *li* = 0.1, - and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the - number of links, or to tweak any of the parameters. - - ## Action Space - The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links* - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the first rotor | -1 | 1 | motor1_rot | hinge | torque (N m) | - | 1 | Torque applied on the second rotor | -1 | 1 | motor2_rot | hinge | torque (N m) | - - ## Observation Space - By default, observations consists of: - * θi: angle of part *i* with respect to the *x* axis - * θi': its derivative with respect to time (angular velocity) - - In the default case, observations do not include the x- and y-coordinates of the front tip. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - Then, the observation space will be `Box(-Inf, Inf, (10,), float64)` where the first two observations - represent the x- and y-coordinates of the front tip. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - By default, the observation is a `Box(-Inf, Inf, (8,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | - | 0 | angle of the front tip | -Inf | Inf | free_body_rot | hinge | angle (rad) | - | 1 | angle of the first rotor | -Inf | Inf | motor1_rot | hinge | angle (rad) | - | 2 | angle of the second rotor | -Inf | Inf | motor2_rot | hinge | angle (rad) | - | 3 | velocity of the tip along the x-axis | -Inf | Inf | slider1 | slide | velocity (m/s) | - | 4 | velocity of the tip along the y-axis | -Inf | Inf | slider2 | slide | velocity (m/s) | - | 5 | angular velocity of front tip | -Inf | Inf | free_body_rot | hinge | angular velocity (rad/s) | - | 6 | angular velocity of first rotor | -Inf | Inf | motor1_rot | hinge | angular velocity (rad/s) | - | 7 | angular velocity of second rotor | -Inf | Inf | motor2_rot | hinge | angular velocity (rad/s) | - | excluded | position of the tip along the x-axis | -Inf | Inf | slider1 | slide | position (m) | - | excluded | position of the tip along the y-axis | -Inf | Inf | slider2 | slide | position (m) | - - ## Rewards - The reward consists of two parts: - - *forward_reward*: A reward of moving forward which is measured - as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is - the time between actions and is dependent on the frame_skip parameter - (default is 4), where the frametime is 0.01 - making the - default *dt = 4 * 0.01 = 0.04*. This reward would be positive if the swimmer - swims right as desired. - - *ctrl_cost*: A cost for penalising the swimmer if it takes - actions that are too large. It is measured as *`ctrl_cost_weight` * - sum(action2)* where *`ctrl_cost_weight`* is a parameter set for the - control and has a default value of 1e-4 - - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity. - - ## Episode End - The episode truncates when the episode length is greater than 1000. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - gym.make('Swimmer-v4') - ``` - - v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Swimmer-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"swimmer.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-4` | Weight for _ctrl_cost_ term (see section on reward) | - | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", diff --git a/gymnasium/envs/mujoco/walker2d_v4.py b/gymnasium/envs/mujoco/walker2d_v4.py index 204f17644..fade5de41 100644 --- a/gymnasium/envs/mujoco/walker2d_v4.py +++ b/gymnasium/envs/mujoco/walker2d_v4.py @@ -14,138 +14,6 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): - """ - ## Description - - This environment builds on the [hopper](https://gymnasium.farama.org/environments/mujoco/hopper/) environment - by adding another set of legs making it possible for the robot to walk forward instead of - hop. Like other Mujoco environments, this environment aims to increase the number of independent state - and control variables as compared to the classic control environments. The walker is a - two-dimensional two-legged figure that consist of seven main body parts - a single torso at the top - (with the two legs splitting after the torso), two thighs in the middle below the torso, two legs - in the bottom below the thighs, and two feet attached to the legs on which the entire body rests. - The goal is to walk in the in the forward (right) - direction by applying torques on the six hinges connecting the seven body parts. - - ## Action Space - The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | - | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | - | 2 | Torque applied on the foot rotor | -1 | 1 | foot_joint | hinge | torque (N m) | - | 3 | Torque applied on the left thigh rotor | -1 | 1 | thigh_left_joint | hinge | torque (N m) | - | 4 | Torque applied on the left leg rotor | -1 | 1 | leg_left_joint | hinge | torque (N m) | - | 5 | Torque applied on the left foot rotor | -1 | 1 | foot_left_joint | hinge | torque (N m) | - - ## Observation Space - Observations consist of positional values of different body parts of the walker, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. - - By default, observations do not include the x-coordinate of the torso. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be `Box(-Inf, Inf, (18,), float64)` where the first observation - represent the x-coordinates of the torso of the walker. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - of the torso will be returned in `info` with key `"x_position"`. - - By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: - - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | - | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | - | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | - | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | - | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | - | 2 | angle of the thigh joint | -Inf | Inf | thigh_joint | hinge | angle (rad) | - | 3 | angle of the leg joint | -Inf | Inf | leg_joint | hinge | angle (rad) | - | 4 | angle of the foot joint | -Inf | Inf | foot_joint | hinge | angle (rad) | - | 5 | angle of the left thigh joint | -Inf | Inf | thigh_left_joint | hinge | angle (rad) | - | 6 | angle of the left leg joint | -Inf | Inf | leg_left_joint | hinge | angle (rad) | - | 7 | angle of the left foot joint | -Inf | Inf | foot_left_joint | hinge | angle (rad) | - | 8 | velocity of the x-coordinate of the torso | -Inf | Inf | rootx | slide | velocity (m/s) | - | 9 | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz | slide | velocity (m/s) | - | 10 | angular velocity of the angle of the torso | -Inf | Inf | rooty | hinge | angular velocity (rad/s) | - | 11 | angular velocity of the thigh hinge | -Inf | Inf | thigh_joint | hinge | angular velocity (rad/s) | - | 12 | angular velocity of the leg hinge | -Inf | Inf | leg_joint | hinge | angular velocity (rad/s) | - | 13 | angular velocity of the foot hinge | -Inf | Inf | foot_joint | hinge | angular velocity (rad/s) | - | 14 | angular velocity of the thigh hinge | -Inf | Inf | thigh_left_joint | hinge | angular velocity (rad/s) | - | 15 | angular velocity of the leg hinge | -Inf | Inf | leg_left_joint | hinge | angular velocity (rad/s) | - | 16 | angular velocity of the foot hinge | -Inf | Inf | foot_left_joint | hinge | angular velocity (rad/s) | - - ## Rewards - The reward consists of three parts: - - *healthy_reward*: Every timestep that the walker is alive, it receives a fixed reward of value `healthy_reward`, - - *forward_reward*: A reward of walking forward which is measured as - *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. - *dt* is the time between actions and is dependeent on the frame_skip parameter - (default is 4), where the frametime is 0.002 - making the default - *dt = 4 * 0.002 = 0.008*. This reward would be positive if the walker walks forward (positive x direction). - - *ctrl_cost*: A cost for penalising the walker if it - takes actions that are too large. It is measured as - *`ctrl_cost_weight` * sum(action2)* where *`ctrl_cost_weight`* is - a parameter set for the control and has a default value of 0.001 - - The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms - - ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. - - ## Episode End - The walker is said to be unhealthy if any of the following happens: - - 1. Any of the state space values is no longer finite - 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` - 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` - - If `terminate_when_unhealthy=True` is passed during construction (which is the default), - the episode ends when any of the following happens: - - 1. Truncation: The episode duration reaches a 1000 timesteps - 2. Termination: The walker is unhealthy - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. - - ## Arguments - - No additional arguments are currently supported in v2 and lower. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4') - ``` - - v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. - - ```python - import gymnasium as gym - env = gym.make('Walker2d-v4', ctrl_cost_weight=0.1, ....) - ``` - - | Parameter | Type | Default | Description | - | -------------------------------------------- | --------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"walker2d.xml"` | Path to a MuJoCo model | - | `forward_reward_weight` | **float** | `1.0` | Weight for _forward_reward_ term (see section on reward) | - | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctr_cost_ term (see section on reward) | - | `healthy_reward` | **float** | `1.0` | Constant reward given if the ant is "healthy" after timestep | - | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the walker is no longer healthy | - | `healthy_z_range` | **tuple** | `(0.8, 2)` | The z-coordinate of the torso of the walker must be in this range to be considered healthy | - | `healthy_angle_range` | **tuple** | `(-1, 1)` | The angle must be in this range to be considered healthy | - | `reset_noise_scale` | **float** | `5e-3` | Scale of random perturbations of initial position and velocity (see section on Starting State) | - | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | - - - ## Version History - - * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 - * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) - * v2: All continuous control environments now use mujoco-py >= 1.50 - * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments. - * v0: Initial versions release (1.0.0) - """ - metadata = { "render_modes": [ "human", From 25fe2c34489f98a8dbf21debf556c76b9fdd67df Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 27 Jun 2023 23:39:01 +0300 Subject: [PATCH 12/53] fix `test_make_erros()` --- tests/envs/registration/test_make.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/envs/registration/test_make.py b/tests/envs/registration/test_make.py index 0262c8831..2d54f2ed6 100644 --- a/tests/envs/registration/test_make.py +++ b/tests/envs/registration/test_make.py @@ -500,7 +500,7 @@ def test_make_errors(): with pytest.raises( gym.error.Error, match=re.escape( - "Environment version v0 for `Humanoid` is deprecated. Please use `Humanoid-v4` instead." + "Environment version v0 for `Humanoid` is deprecated. Please use `Humanoid-v5` instead." ), ): gym.make("Humanoid-v0") From 43c25c6799712f69602795b912ce1d0c566e3fb9 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Wed, 28 Jun 2023 01:29:08 +0300 Subject: [PATCH 13/53] cleanup tests --- tests/envs/mujoco/test_mujoco_v5.py | 213 ++++++++++++++-------------- 1 file changed, 103 insertions(+), 110 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index 10a1df169..61e8420ef 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -58,7 +58,15 @@ def test_verify_info_x_position(env_id): # Note: "HumnanoidStandup-v4" does not have `info` # Note: "Humnanoid-v4/3" & "Ant-v4/3" fail this test @pytest.mark.parametrize( - "env_id", ["Ant-v5", "Humanoid-v5", "HumanoidStandup-v5", "Swimmer-v5"] + "env_id", + [ + "Ant-v5", + "Humanoid-v5", + "HumanoidStandup-v5", + "Swimmer-v5", + "Swimmer-v4", + "Swimmer-v3", + ], ) def test_verify_info_y_position(env_id): """Asserts that the environment has position[1] == info['y_position']""" @@ -146,19 +154,20 @@ def mass_center(model, data): # Note: Inverted(Double)Pendulum-v4/2 does not have `info['reward_survive']`, but it is still affected # Note: all `v4/v3/v2` environments with a heathly reward are fail this test @pytest.mark.parametrize( - "env_id", + "env", [ - "Ant-v5", - "Hopper-v5", - "Humanoid-v5", - "InvertedDoublePendulum-v5", - "InvertedPendulum-v5", - "Walker2d-v5", + "Ant", + "Hopper", + "Humanoid", + "InvertedDoublePendulum", + "InvertedPendulum", + "Walker2d", ], ) -def test_verify_reward_survive(env_id): +@pytest.mark.parametrize("version", ["v5"]) +def test_verify_reward_survive(env, version): """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states""" - env = gym.make(env_id, reset_noise_scale=0) + env = gym.make(f"{env}-{version}", reset_noise_scale=0) env.reset(seed=0) env.action_space.seed(0) @@ -206,96 +215,75 @@ def test_frame_skip(env, version, frame_skip): # Dev Note: This can be version env parametrized because each env has it's own reward function @pytest.mark.parametrize("version", ["v5"]) def test_reward_sum(version): - """Assert that the total reward equals the sum of the individual reward terms.""" + """Assert that the total reward equals the sum of the individual reward terms, also asserts that the reward function has no fp ordering arithmetic errors.""" + NUM_STEPS = 100 env = gym.make(f"Ant-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert ( - reward - - info["reward_forward"] - - info["reward_ctrl"] - - info["reward_contact"] - - info["reward_survive"] - < 1e-14 - ) + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == (info["reward_survive"] + info["reward_forward"]) - ( + -info["reward_ctrl"] + -info["reward_contact"] + ) env = gym.make(f"HalfCheetah-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_forward"] - info["reward_ctrl"] < 1e-14 + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_forward"] + info["reward_ctrl"] env = gym.make(f"Hopper-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert ( - reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] - < 1e-14 - ) + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] env = gym.make(f"Humanoid-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert ( - reward - - info["reward_forward"] - - info["reward_ctrl"] - - info["reward_contact"] - - info["reward_survive"] - < 1e-14 - ) + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == (info["reward_forward"] + info["reward_survive"]) + (info["reward_ctrl"] + info["reward_contact"]) env = gym.make(f"HumanoidStandup-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert ( - reward - - info["reward_linup"] - - info["reward_quadctrl"] - - info["reward_impact"] - - 1 - < 1e-14 - ) + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_linup"] + info["reward_quadctrl"] + info["reward_impact"] + 1 env = gym.make(f"InvertedDoublePendulum-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert ( - reward - - info["reward_survive"] - - info["distance_penalty"] - - info["velocity_penalty"] - < 1e-14 - ) + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_survive"] + info["distance_penalty"] + info["velocity_penalty"] env = gym.make(f"InvertedPendulum-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward == info["reward_survive"] + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_survive"] env = gym.make(f"Pusher-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert ( - reward - info["reward_dist"] - info["reward_ctrl"] - info["reward_near"] < 1e-14 - ) + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_dist"] + info["reward_ctrl"] + info["reward_near"] env = gym.make(f"Reacher-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_dist"] - info["reward_ctrl"] < 1e-14 + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_dist"] + info["reward_ctrl"] env = gym.make(f"Swimmer-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward - info["reward_forward"] - info["reward_ctrl"] < 1e-14 + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_forward"] + info["reward_ctrl"] env = gym.make(f"Walker2d-{version}") env.reset() - _, reward, _, _, info = env.step(env.action_space.sample()) - assert ( - reward - info["reward_forward"] - info["reward_ctrl"] - info["reward_survive"] - < 1e-14 - ) + for _ in range(NUM_STEPS): + _, reward, _, _, info = env.step(env.action_space.sample()) + assert reward == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] # Note: the environtments that are not present, is because they do not have identical behaviour @@ -304,21 +292,23 @@ def test_reward_sum(version): ) def test_identical_behaviour_v45(env): """Verify that v4 -> v5 transition. does not change the behaviour of the environments in way way""" + NUM_STEPS = 100 + env_v4 = gym.make(env + "-v4") env_v5 = gym.make(env + "-v5") env_v4.reset(seed=1234) env_v5.reset(seed=1234) - action = env_v4.action_space.sample() - obs_v4, rew_v4, terminal_v4, truncated_v4, info_v4 = env_v4.step(action) - obs_v5, rew_v5, terminal_v5, truncated_v5, info_v5 = env_v5.step(action) - assert obs_v4.shape[0] != 1 - assert obs_v5.shape[0] != 1 - assert (env_v4.unwrapped.data.qpos == env_v5.unwrapped.data.qpos).all() - assert (env_v4.unwrapped.data.qvel == env_v5.unwrapped.data.qvel).all() - if env not in ["HumanoidStandup", "Reacher"]: # they have different obs - assert (obs_v4 == obs_v5).all() - assert rew_v4 == rew_v5 - assert terminal_v4 == terminal_v5 and truncated_v4 == truncated_v5 + for _ in range(NUM_STEPS): + action = env_v4.action_space.sample() + obs_v4, rew_v4, terminal_v4, truncated_v4, info_v4 = env_v4.step(action) + obs_v5, rew_v5, terminal_v5, truncated_v5, info_v5 = env_v5.step(action) + assert obs_v4.shape[0] != 1 and obs_v5.shape[0] != 1 + assert (env_v4.unwrapped.data.qpos == env_v5.unwrapped.data.qpos).all() + assert (env_v4.unwrapped.data.qvel == env_v5.unwrapped.data.qvel).all() + if env not in ["HumanoidStandup", "Reacher"]: # they have different obs + assert (obs_v4 == obs_v5).all() + assert rew_v4 == rew_v5 + assert terminal_v4 == terminal_v5 and truncated_v4 == truncated_v5 @pytest.mark.parametrize("version", ["v5", "v4"]) @@ -377,10 +367,11 @@ def test_distance_from_origin_info(env_id): ) -@pytest.mark.parametrize("env_id", ["Hopper-v5", "HumanoidStandup-v5", "Walker2d-v5"]) -def test_z_distance_from_origin_info(env_id): +@pytest.mark.parametrize("env", ["Hopper", "HumanoidStandup", "Walker2d"]) +@pytest.mark.parametrize("version", ["v5"]) +def test_z_distance_from_origin_info(env, version): """Verify that `info"z_distance_from_origin"` is correct""" - env = gym.make(env_id) + env = gym.make(f"{env}-{version}") env.reset() _, _, _, _, info = env.step(env.action_space.sample()) mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) @@ -392,21 +383,22 @@ def test_z_distance_from_origin_info(env_id): @pytest.mark.parametrize( - "env_id", + "env", [ - "Ant-v5", - "HalfCheetah-v5", - "Hopper-v5", - "Humanoid-v5", - "HumanoidStandup-v5", - "InvertedPendulum-v5", - "Swimmer-v5", - "Walker2d-v5", + "Ant", + "HalfCheetah", + "Hopper", + "Humanoid", + "HumanoidStandup", + "InvertedPendulum", + "Swimmer", + "Walker2d", ], ) -def test_observation_structure(env_id): +@pytest.mark.parametrize("version", ["v5"]) +def test_observation_structure(env, version): """Verify that the `env.observation_structure` is properly defined.""" - env = gym.make(env_id) + env = gym.make(f"{env}-{version}") if hasattr(env, "observation_structure"): return @@ -416,37 +408,38 @@ def test_observation_structure(env_id): env.unwrapped.model.nq == obs_struct.get("skipped_qpos", 0) + obs_struct["qpos"] ) assert env.unwrapped.model.nv == obs_struct["qvel"] - if obs_struct.get("cinert", 0): + if obs_struct.get("cinert", False): assert (env.unwrapped.model.nbody - 1) * 10 == obs_struct["cinert"] - if obs_struct.get("cvel", 0): + if obs_struct.get("cvel", False): assert (env.unwrapped.model.nbody - 1) * 6 == obs_struct["cvel"] - if obs_struct.get("qfrc_actuator", 0): + if obs_struct.get("qfrc_actuator", False): assert env.unwrapped.model.nv - 6 == obs_struct["qfrc_actuator"] - if obs_struct.get("cfrc_ext", 0): + if obs_struct.get("cfrc_ext", False): assert (env.unwrapped.model.nbody - 1) * 6 == obs_struct["cfrc_ext"] - if obs_struct.get("ten_lenght", 0): + if obs_struct.get("ten_lenght", False): assert env.unwrapped.model.ntendon == obs_struct["ten_lenght"] - if obs_struct.get("ten_velocity", 0): + if obs_struct.get("ten_velocity", False): assert env.unwrapped.model.ntendon == obs_struct["ten_velocity"] @pytest.mark.parametrize( - "env_id", + "env", [ - "Ant-v5", - "HalfCheetah-v5", - "Hopper-v5", - "Humanoid-v5", - "HumanoidStandup-v5", - "Swimmer-v5", - "Walker2d-v5", + "Ant", + "HalfCheetah", + "Hopper", + "Humanoid", + "HumanoidStandup", + "Swimmer", + "Walker2d", ], ) -def test_reset_info(env_id): +@pytest.mark.parametrize("version", ["v5"]) +def test_reset_info(env, version): """Verify that the environment returns info at `reset()`""" - env = gym.make(env_id) + env = gym.make(f"{env}-{version}") _, reset_info = env.reset() - assert reset_info.get("x_position") + assert "x_position" in reset_info """ From 2602a047dd27afbd45a71d0234a680ea742cdb4c Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Wed, 28 Jun 2023 02:08:38 +0300 Subject: [PATCH 14/53] ant cleanup --- gymnasium/envs/mujoco/ant_v5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 38b00d503..978fd62c5 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -363,9 +363,9 @@ def step(self, action): rewards = forward_reward + healthy_reward - costs = ctrl_cost = self.control_cost(action) + ctrl_cost = self.control_cost(action) contact_cost = self.contact_cost - costs += contact_cost + costs = ctrl_cost + contact_cost terminated = self.terminated observation = self._get_obs() From 79c8fa9dc7f917f060f8613ba265673fa3db2371 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Wed, 28 Jun 2023 15:39:52 +0300 Subject: [PATCH 15/53] fix mjc-py error --- gymnasium/envs/mujoco/mujoco_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gymnasium/envs/mujoco/mujoco_env.py b/gymnasium/envs/mujoco/mujoco_env.py index c3490bd1f..f1759da3d 100644 --- a/gymnasium/envs/mujoco/mujoco_env.py +++ b/gymnasium/envs/mujoco/mujoco_env.py @@ -209,7 +209,7 @@ def __init__( logger.deprecation( "This version of the mujoco environments depends " "on the mujoco-py bindings, which are no longer maintained " - "and may stop working. Please upgrade to the v4 versions of " + "and may stop working. Please upgrade to the v5 or v4 versions of " "the environments (which depend on the mujoco python bindings instead), unless " "you are trying to precisely replicate previous works)." ) From a8af4f5eaa07f328d18dd75e1822295c4d34b784 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Thu, 29 Jun 2023 20:36:00 +0300 Subject: [PATCH 16/53] update tests --- tests/envs/mujoco/test_mujoco_v5.py | 277 ++++++++++++++++++++++------ 1 file changed, 218 insertions(+), 59 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index 61e8420ef..bc2e02092 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -6,6 +6,7 @@ import gymnasium as gym from gymnasium.error import Error +from gymnasium.utils.env_checker import data_equivalence ALL_MUJOCO_ENVS = [ @@ -45,8 +46,8 @@ "Walker2d-v3", ], ) -def test_verify_info_x_position(env_id): - """Asserts that the environment has position[0] == info['x_position']""" +def test_verify_info_x_position(env_id: str): + """Asserts that the environment has position[0] == info['x_position'].""" env = gym.make(env_id, exclude_current_positions_from_observation=False) _, _ = env.reset() @@ -68,8 +69,8 @@ def test_verify_info_x_position(env_id): "Swimmer-v3", ], ) -def test_verify_info_y_position(env_id): - """Asserts that the environment has position[1] == info['y_position']""" +def test_verify_info_y_position(env_id: str): + """Asserts that the environment has position[1] == info['y_position'].""" env = gym.make(env_id, exclude_current_positions_from_observation=False) _, _ = env.reset() @@ -81,8 +82,8 @@ def test_verify_info_y_position(env_id): # Note: "HumnanoidStandup-v4" does not have `info` @pytest.mark.parametrize("env", ["HalfCheetah", "Hopper", "Swimmer", "Walker2d"]) @pytest.mark.parametrize("version", ["v5", "v4", "v3"]) -def test_verify_info_x_velocity(env, version): - """Asserts that the environment `info['x_velocity']` is properly assigned""" +def test_verify_info_x_velocity(env: str, version: str): + """Asserts that the environment `info['x_velocity']` is properly assigned.""" env = gym.make(f"{env}-{version}") env.reset() @@ -97,8 +98,8 @@ def test_verify_info_x_velocity(env, version): # Note: "HumnanoidStandup-v4" does not have `info` @pytest.mark.parametrize("env_id", ["Swimmer-v5", "Swimmer-v4", "Swimmer-v3"]) -def test_verify_info_y_velocity(env_id): - """Asserts that the environment `info['y_velocity']` is properly assigned""" +def test_verify_info_y_velocity(env_id: str): + """Asserts that the environment `info['y_velocity']` is properly assigned.""" env = gym.make(env_id) env.reset() @@ -112,8 +113,8 @@ def test_verify_info_y_velocity(env_id): @pytest.mark.parametrize("env_id", ["Ant-v5", "Ant-v4", "Ant-v3"]) -def test_verify_info_xy_velocity_xpos(env_id): - """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the ant environment which uses kinmatics for the velocity""" +def test_verify_info_xy_velocity_xpos(env_id: str): + """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the ant environment which uses kinmatics for the velocity.""" env = gym.make(env_id) env.reset() @@ -128,8 +129,8 @@ def test_verify_info_xy_velocity_xpos(env_id): @pytest.mark.parametrize("env_id", ["Humanoid-v5", "Humanoid-v4", "Humanoid-v3"]) -def test_verify_info_xy_velocity_com(env_id): - """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the humanoid environment which uses kinmatics of Center Of Mass for the velocity""" +def test_verify_info_xy_velocity_com(env_id: str): + """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the humanoid environment which uses kinmatics of Center Of Mass for the velocity.""" def mass_center(model, data): mass = np.expand_dims(model.body_mass, axis=1) @@ -165,8 +166,8 @@ def mass_center(model, data): ], ) @pytest.mark.parametrize("version", ["v5"]) -def test_verify_reward_survive(env, version): - """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states""" +def test_verify_reward_survive(env: str, version: str): + """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states.""" env = gym.make(f"{env}-{version}", reset_noise_scale=0) env.reset(seed=0) env.action_space.seed(0) @@ -198,8 +199,8 @@ def test_verify_reward_survive(env, version): @pytest.mark.parametrize("env", ALL_MUJOCO_ENVS) @pytest.mark.parametrize("version", ["v5"]) @pytest.mark.parametrize("frame_skip", [1, 2, 3, 4, 5]) -def test_frame_skip(env, version, frame_skip): - """Verify that all `mujoco` envs work with different `frame_skip` values""" +def test_frame_skip(env: str, version: str, frame_skip: int): + """Verify that all `mujoco` envs work with different `frame_skip` values.""" env_id = f"{env}-{version}" env = gym.make(env_id, frame_skip=frame_skip) @@ -214,7 +215,7 @@ def test_frame_skip(env, version, frame_skip): # Dev Note: This can be version env parametrized because each env has it's own reward function @pytest.mark.parametrize("version", ["v5"]) -def test_reward_sum(version): +def test_reward_sum(version: str): """Assert that the total reward equals the sum of the individual reward terms, also asserts that the reward function has no fp ordering arithmetic errors.""" NUM_STEPS = 100 env = gym.make(f"Ant-{version}") @@ -235,25 +236,41 @@ def test_reward_sum(version): env.reset() for _ in range(NUM_STEPS): _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] + assert ( + reward + == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] + ) env = gym.make(f"Humanoid-{version}") env.reset() for _ in range(NUM_STEPS): _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward == (info["reward_forward"] + info["reward_survive"]) + (info["reward_ctrl"] + info["reward_contact"]) + assert reward == (info["reward_forward"] + info["reward_survive"]) + ( + info["reward_ctrl"] + info["reward_contact"] + ) env = gym.make(f"HumanoidStandup-{version}") env.reset() for _ in range(NUM_STEPS): _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward == info["reward_linup"] + info["reward_quadctrl"] + info["reward_impact"] + 1 + assert ( + reward + == info["reward_linup"] + + info["reward_quadctrl"] + + info["reward_impact"] + + 1 + ) env = gym.make(f"InvertedDoublePendulum-{version}") env.reset() for _ in range(NUM_STEPS): _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward == info["reward_survive"] + info["distance_penalty"] + info["velocity_penalty"] + assert ( + reward + == info["reward_survive"] + + info["distance_penalty"] + + info["velocity_penalty"] + ) env = gym.make(f"InvertedPendulum-{version}") env.reset() @@ -283,37 +300,76 @@ def test_reward_sum(version): env.reset() for _ in range(NUM_STEPS): _, reward, _, _, info = env.step(env.action_space.sample()) - assert reward == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] + assert ( + reward + == info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"] + ) # Note: the environtments that are not present, is because they do not have identical behaviour -@pytest.mark.parametrize( - "env", ["HalfCheetah", "HumanoidStandup", "Pusher", "Reacher", "Swimmer"] -) -def test_identical_behaviour_v45(env): - """Verify that v4 -> v5 transition. does not change the behaviour of the environments in way way""" +def test_identical_behaviour_v45(): + """Verify that v4 -> v5 transition. Does not change the behaviour of the environments in any unexpected way.""" NUM_STEPS = 100 - env_v4 = gym.make(env + "-v4") - env_v5 = gym.make(env + "-v5") - env_v4.reset(seed=1234) - env_v5.reset(seed=1234) - for _ in range(NUM_STEPS): - action = env_v4.action_space.sample() - obs_v4, rew_v4, terminal_v4, truncated_v4, info_v4 = env_v4.step(action) - obs_v5, rew_v5, terminal_v5, truncated_v5, info_v5 = env_v5.step(action) - assert obs_v4.shape[0] != 1 and obs_v5.shape[0] != 1 - assert (env_v4.unwrapped.data.qpos == env_v5.unwrapped.data.qpos).all() - assert (env_v4.unwrapped.data.qvel == env_v5.unwrapped.data.qvel).all() - if env not in ["HumanoidStandup", "Reacher"]: # they have different obs - assert (obs_v4 == obs_v5).all() - assert rew_v4 == rew_v5 - assert terminal_v4 == terminal_v5 and truncated_v4 == truncated_v5 + env_v4 = gym.make("Ant-v4") + env_v5 = gym.make("Ant-v5", include_cfrc_ext_in_observation=False) + check_environments_match( + env_v4, env_v5, NUM_STEPS, skip_rew=True, info_comparison="skip" + ) + + env_v4 = gym.make("HalfCheetah-v4") + env_v5 = gym.make("HalfCheetah-v5") + check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") + + # env_v4 = gym.make("Hopper-v4") + # env_v5 = gym.make("Hopper-v5") + # check_environments_match(env_v4, env_v5, num_steps, skip_rew=true, info_comparison="superset") + + # skipping humanoid, everything has changed + + env_v4 = gym.make("HumanoidStandup-v4") + env_v5 = gym.make("HumanoidStandup-v5") + check_environments_match( + env_v4, env_v5, NUM_STEPS, skip_obs=True, info_comparison="superset" + ) + + env_v4 = gym.make("InvertedDoublePendulum-v4") + env_v5 = gym.make("InvertedDoublePendulum-v5") + check_environments_match( + env_v4, + env_v5, + NUM_STEPS, + skip_obs=True, + skip_rew=True, + info_comparison="superset", + ) + + env_v4 = gym.make("InvertedPendulum-v4") + env_v5 = gym.make("InvertedPendulum-v5") + check_environments_match( + env_v4, env_v5, NUM_STEPS, skip_rew=True, info_comparison="superset" + ) + + env_v4 = gym.make("Pusher-v4") + env_v5 = gym.make("Pusher-v5") + check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") + + env_v4 = gym.make("Reacher-v4") + env_v5 = gym.make("Reacher-v5") + check_environments_match( + env_v4, env_v5, NUM_STEPS, skip_obs=True, info_comparison="skip" + ) + + env_v4 = gym.make("Swimmer-v4") + env_v5 = gym.make("Swimmer-v5") + check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") + + # skipping Walker2d, since the model has changed @pytest.mark.parametrize("version", ["v5", "v4"]) -def test_ant_com(version): - """Verify the kinmatic behaviour of the ant""" +def test_ant_com(version: str): + """Verify the kinmatic behaviour of the ant.""" env = gym.make( f"Ant-{version}" ) # `env` contains `data : MjData` and `model : MjModel` @@ -335,8 +391,8 @@ def test_ant_com(version): @pytest.mark.parametrize("version", ["v5", "v4", "v3", "v2"]) -def test_set_state(version): - """Simple Test to verify that `mujocoEnv.set_state()` works correctly""" +def test_set_state(version: str): + """Simple Test to verify that `mujocoEnv.set_state()` works correctly.""" env = gym.make(f"Hopper-{version}") env.reset() new_qpos = np.array( @@ -357,8 +413,8 @@ def test_set_state(version): @pytest.mark.parametrize( "env_id", ["Ant-v5", "Humanoid-v5", "Swimmer-v5", "Swimmer-v4", "Swimmer-v3"] ) -def test_distance_from_origin_info(env_id): - """Verify that `info"distance_from_origin"` is correct""" +def test_distance_from_origin_info(env_id: str): + """Verify that `info"distance_from_origin"` is correct.""" env = gym.make(env_id) env.reset() _, _, _, _, info = env.step(env.action_space.sample()) @@ -369,8 +425,8 @@ def test_distance_from_origin_info(env_id): @pytest.mark.parametrize("env", ["Hopper", "HumanoidStandup", "Walker2d"]) @pytest.mark.parametrize("version", ["v5"]) -def test_z_distance_from_origin_info(env, version): - """Verify that `info"z_distance_from_origin"` is correct""" +def test_z_distance_from_origin_info(env: str, version: str): + """Verify that `info"z_distance_from_origin"` is correct.""" env = gym.make(f"{env}-{version}") env.reset() _, _, _, _, info = env.step(env.action_space.sample()) @@ -396,7 +452,7 @@ def test_z_distance_from_origin_info(env, version): ], ) @pytest.mark.parametrize("version", ["v5"]) -def test_observation_structure(env, version): +def test_observation_structure(env: str, version: str): """Verify that the `env.observation_structure` is properly defined.""" env = gym.make(f"{env}-{version}") if hasattr(env, "observation_structure"): @@ -430,16 +486,20 @@ def test_observation_structure(env, version): "Hopper", "Humanoid", "HumanoidStandup", + # "InvertedDoublePendulum", + # "InvertedPendulum", + # "Pusher", + # "Reacher", "Swimmer", "Walker2d", ], ) @pytest.mark.parametrize("version", ["v5"]) -def test_reset_info(env, version): - """Verify that the environment returns info at `reset()`""" +def test_reset_info(env: str, version: str): + """Verify that the environment returns info with `reset()`.""" env = gym.make(f"{env}-{version}") _, reset_info = env.reset() - assert "x_position" in reset_info + assert len(reset_info) > 0 """ @@ -456,8 +516,8 @@ def test_reset_info(env, version): # Note: the max height used to be wrong in the documentation. @pytest.mark.parametrize("version", ["v5"]) -def test_inverted_double_pendulum_max_height(version): - """Verify the max height of Inverted Double Pendulum""" +def test_inverted_double_pendulum_max_height(version: str): + """Verify the max height of Inverted Double Pendulum.""" env = gym.make(f"InvertedDoublePendulum-{version}", reset_noise_scale=0) env.reset() y = env.unwrapped.data.site_xpos[0][2] @@ -465,8 +525,8 @@ def test_inverted_double_pendulum_max_height(version): @pytest.mark.parametrize("version", ["v4"]) -def test_inverted_double_pendulum_max_height_old(version): - """Verify the max height of Inverted Double Pendulum (v4 does not have `reset_noise_scale` argument)""" +def test_inverted_double_pendulum_max_height_old(version: str): + """Verify the max height of Inverted Double Pendulum (v4 does not have `reset_noise_scale` argument).""" env = gym.make(f"InvertedDoublePendulum-{version}") env.set_state(env.init_qpos, env.init_qvel) y = env.unwrapped.data.site_xpos[0][2] @@ -475,7 +535,7 @@ def test_inverted_double_pendulum_max_height_old(version): # note: fails with `brax==0.9.0` @pytest.mark.parametrize("version", ["v5", "v4"]) -def test_model_object_count(version): +def test_model_object_count(version: str): """Verify that all the objects of the model are loaded, mostly useful for using non-mujoco simulator.""" env = gym.make(f"Ant-{version}") assert env.unwrapped.model.nq == 15 @@ -586,3 +646,102 @@ def test_model_object_count(version): assert env.unwrapped.model.njnt == 9 assert env.unwrapped.model.ngeom == 8 assert env.unwrapped.model.ntendon == 0 + + +""" +def test_dt(): + env_a = gym.make("Ant-v5", include_cfrc_ext_in_observation=False) + env_b = gym.make("Ant-v5", include_cfrc_ext_in_observation=False, frame_skip=1) + env_b.unwrapped.model.opt.timestep = 0.05 + + assert env_a.dt == env_b.dt + check_environments_match(env_a, env_b, num_steps=100) +""" + + +def check_environments_match( + env_a: gym.Env, + env_b: gym.Env, + num_steps: int, + seed: int = 0, + skip_obs: bool = False, + skip_rew: bool = False, + skip_terminal: bool = False, + skip_truncated: bool = False, + info_comparison: str = "equivalence", +): + """Checks if the environments `env_a` & `env_b` are identical. + + Args: + env_a: First environment to check. + env_b: Second environment to check. + num_steps: number of timesteps to test for, setting to 0 tests only resetting. + seed: used the seed the reset & actions. + skip_obs: If `True` it does not check for equivalence of the observation. + skip_rew: If `True` it does not check for equivalence of the observation. + skip_terminal: If `True` it does not check for equivalence of the observation. + skip_truncated: If `True` it does not check for equivalence of the observation. + skip_info: If `True` it does not check for equivalence of the observation. + info_comparison: If "equivalence" then checks if the `info`s are identical, + if "superset" checks if `info_b` is a (non-strict) superset of `info_a` + if "skip" no checks are made at the `info`. + """ + assert info_comparison in ["equivalence", "superset", "skip"] + + assert env_a.action_space == env_b.action_space + assert skip_obs or env_b.observation_space == env_b.observation_space + + env_a.action_space.seed(seed) + obs_a, info_a = env_a.reset(seed=seed) + obs_b, info_b = env_b.reset(seed=seed) + + assert skip_obs or data_equivalence( + obs_a, obs_b + ), "resetting observation is not equivalent" + if info_comparison == "equivalence": + assert data_equivalence(info_a, info_b), "resetting info is not equivalent" + elif info_comparison == "superset": + for key in info_a: + assert data_equivalence( + info_a[key], info_b[key] + ), "resetting info is not a superset" + + for _ in range(num_steps): + action = env_a.action_space.sample() + obs_a, rew_a, terminal_a, truncated_a, info_a = env_a.step(action) + obs_b, rew_b, terminal_b, truncated_b, info_b = env_b.step(action) + assert skip_obs or data_equivalence( + obs_a, obs_b + ), "stepping observation is not equivalent" + assert skip_rew or data_equivalence( + rew_a, rew_b + ), "stepping reward is not equivalent" + assert ( + skip_terminal or terminal_a == terminal_b + ), "stepping terminal is not equivalent" + assert ( + skip_truncated or truncated_a == truncated_b + ), "stepping truncated is not equivalent" + if info_comparison == "equivalence": + assert data_equivalence(info_a, info_b), "stepping info is not equivalent" + elif info_comparison == "superset": + for key in info_a: + assert data_equivalence( + info_a[key], info_b[key] + ), "stepping info is not a superset" + + if terminal_a or truncated_a or terminal_b or truncated_b: + obs_a, info_a = env_a.reset(seed=seed) + obs_b, info_b = env_b.reset(seed=seed) + assert skip_obs or data_equivalence( + obs_a, obs_b + ), "resetting observation is not equivalent" + if info_comparison == "equivalence": + assert data_equivalence( + info_a, info_b + ), "resetting info is not equivalent" + elif info_comparison == "superset": + for key in info_a: + assert data_equivalence( + info_a[key], info_b[key] + ), "resetting info is not a superset" From 071d3a1670b18ee9a091f7752cca15e229f2a213 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Fri, 30 Jun 2023 13:40:14 +0300 Subject: [PATCH 17/53] cleanup --- gymnasium/envs/mujoco/ant_v5.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 978fd62c5..201a88817 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -349,11 +349,11 @@ def terminated(self): def step(self, action): xy_position_before = self.data.body(self._main_body).xpos[:2].copy() # TODO remove after validation - assert (xy_position_before == self.get_body_com("torso")[:2].copy()).all() + # assert (xy_position_before == self.get_body_com("torso")[:2].copy()).all() self.do_simulation(action, self.frame_skip) xy_position_after = self.data.body(self._main_body).xpos[:2].copy() # TODO remove after validation - assert (xy_position_after == self.get_body_com("torso")[:2].copy()).all() + # assert (xy_position_after == self.get_body_com("torso")[:2].copy()).all() xy_velocity = (xy_position_after - xy_position_before) / self.dt x_velocity, y_velocity = xy_velocity From 3b902218bbb21146ff65bba1d173b7733bde4691 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Fri, 30 Jun 2023 14:22:20 +0300 Subject: [PATCH 18/53] remove old file --- .../envs/mujoco/assets/walker2d_v5_old.xml | 68 ------------------- 1 file changed, 68 deletions(-) delete mode 100644 gymnasium/envs/mujoco/assets/walker2d_v5_old.xml diff --git a/gymnasium/envs/mujoco/assets/walker2d_v5_old.xml b/gymnasium/envs/mujoco/assets/walker2d_v5_old.xml deleted file mode 100644 index 180bfaec2..000000000 --- a/gymnasium/envs/mujoco/assets/walker2d_v5_old.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - From ee571a64c69e72592b96f98f6dc7551e65f22f40 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 16:33:36 +0300 Subject: [PATCH 19/53] use `env_match.py` --- tests/envs/mujoco/test_mujoco_v5.py | 90 +---------------------------- 1 file changed, 1 insertion(+), 89 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index bc2e02092..2c51158b0 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -6,7 +6,7 @@ import gymnasium as gym from gymnasium.error import Error -from gymnasium.utils.env_checker import data_equivalence +from gymnasium.utils.env_match import check_environments_match ALL_MUJOCO_ENVS = [ @@ -657,91 +657,3 @@ def test_dt(): assert env_a.dt == env_b.dt check_environments_match(env_a, env_b, num_steps=100) """ - - -def check_environments_match( - env_a: gym.Env, - env_b: gym.Env, - num_steps: int, - seed: int = 0, - skip_obs: bool = False, - skip_rew: bool = False, - skip_terminal: bool = False, - skip_truncated: bool = False, - info_comparison: str = "equivalence", -): - """Checks if the environments `env_a` & `env_b` are identical. - - Args: - env_a: First environment to check. - env_b: Second environment to check. - num_steps: number of timesteps to test for, setting to 0 tests only resetting. - seed: used the seed the reset & actions. - skip_obs: If `True` it does not check for equivalence of the observation. - skip_rew: If `True` it does not check for equivalence of the observation. - skip_terminal: If `True` it does not check for equivalence of the observation. - skip_truncated: If `True` it does not check for equivalence of the observation. - skip_info: If `True` it does not check for equivalence of the observation. - info_comparison: If "equivalence" then checks if the `info`s are identical, - if "superset" checks if `info_b` is a (non-strict) superset of `info_a` - if "skip" no checks are made at the `info`. - """ - assert info_comparison in ["equivalence", "superset", "skip"] - - assert env_a.action_space == env_b.action_space - assert skip_obs or env_b.observation_space == env_b.observation_space - - env_a.action_space.seed(seed) - obs_a, info_a = env_a.reset(seed=seed) - obs_b, info_b = env_b.reset(seed=seed) - - assert skip_obs or data_equivalence( - obs_a, obs_b - ), "resetting observation is not equivalent" - if info_comparison == "equivalence": - assert data_equivalence(info_a, info_b), "resetting info is not equivalent" - elif info_comparison == "superset": - for key in info_a: - assert data_equivalence( - info_a[key], info_b[key] - ), "resetting info is not a superset" - - for _ in range(num_steps): - action = env_a.action_space.sample() - obs_a, rew_a, terminal_a, truncated_a, info_a = env_a.step(action) - obs_b, rew_b, terminal_b, truncated_b, info_b = env_b.step(action) - assert skip_obs or data_equivalence( - obs_a, obs_b - ), "stepping observation is not equivalent" - assert skip_rew or data_equivalence( - rew_a, rew_b - ), "stepping reward is not equivalent" - assert ( - skip_terminal or terminal_a == terminal_b - ), "stepping terminal is not equivalent" - assert ( - skip_truncated or truncated_a == truncated_b - ), "stepping truncated is not equivalent" - if info_comparison == "equivalence": - assert data_equivalence(info_a, info_b), "stepping info is not equivalent" - elif info_comparison == "superset": - for key in info_a: - assert data_equivalence( - info_a[key], info_b[key] - ), "stepping info is not a superset" - - if terminal_a or truncated_a or terminal_b or truncated_b: - obs_a, info_a = env_a.reset(seed=seed) - obs_b, info_b = env_b.reset(seed=seed) - assert skip_obs or data_equivalence( - obs_a, obs_b - ), "resetting observation is not equivalent" - if info_comparison == "equivalence": - assert data_equivalence( - info_a, info_b - ), "resetting info is not equivalent" - elif info_comparison == "superset": - for key in info_a: - assert data_equivalence( - info_a[key], info_b[key] - ), "resetting info is not a superset" From 676bf0f75a47586078fa016cee1242a7e6b96cf8 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 21:35:12 +0300 Subject: [PATCH 20/53] doc typo fixes --- gymnasium/envs/mujoco/ant_v5.py | 19 +++++++++---------- gymnasium/envs/mujoco/half_cheetah_v5.py | 10 +++++----- gymnasium/envs/mujoco/hopper_v5.py | 2 +- gymnasium/envs/mujoco/humanoid_v5.py | 2 +- gymnasium/envs/mujoco/swimmer_v5.py | 2 +- gymnasium/envs/mujoco/walker2d_v5.py | 2 +- 6 files changed, 18 insertions(+), 19 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 201a88817..d8c089e01 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -114,7 +114,7 @@ class AntEnv(MujocoEnv, utils.EzPickle): | 13 |12 | ankle_4 (back right leg) | The (x,y,z) coordinates are translational DOFs while the orientations are rotational - DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). + DOFs expressed as quaternions. One can read more about free joints in the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). **Note:** Ant-v4+ environment no longer has the following contact forces issue. @@ -125,36 +125,35 @@ class AntEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of three parts: + The reward consists of four parts: - *healthy_reward*: - Every timestep that the ant is healthy (see definition in section "Episode Termination"), + Every timestep that the Ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`. - *forward_reward*: - A reward of moving forward, + A reward for moving forward, this reward would be positive if the Ant moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the `main_body` ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 5), - and `frametime` which is 0.01 - making the default $dt = 5 \times 0.01 = 0.05$, + $dt$ is the time between actions, which is depends on the `frame_skip` parameter (default is 5), + and `frametime`, which is 0.01 - so the default is $dt = 5 \times 0.01 = 0.05$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: - A negative reward for penalizing the Ant if it takes actions that are too large. + A negative reward to penalize the Ant for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $0.5$). - *contact_cost*: - A negative reward for penalizing the Ant if the external contact forces are too large. + A negative reward to penalize the Ant if the external contact forces are too large. $w_{contact} \times \\|F_{contact}\\|_2^2$, where $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-4}$), $F_{contact}$ are the external contact forces clipped by `contact_force_range` (see `cfrc_ext` section on observation). The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. + and `info` will also contain the individual reward terms. But if `use_contact_forces=false` on `v4` The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. - In either case `info` will also contain the individual reward terms. - ## Starting State All observations start in state diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py index b02082bfe..a9166492a 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v5.py +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -78,17 +78,17 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): ## Rewards The reward consists of two parts: - *forward_reward*: - A reward of moving forward, + A reward for moving forward, this reward would be positive if the Half Cheetah moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the "tip" ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 5), - and `frametime` which is 0.01 - making the default $dt = 5 \times 0.01 = 0.05$, + $dt$ is the time between actions which is depends on the `frame_skip` parameter (default is 5), + and `frametime` which is 0.01 - so the default is $dt = 5 \times 0.01 = 0.05$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: - A negative reward for penalizing the Half Cheetah if it takes actions that are too large. + A negative reward to penalize the Half Cheetah for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, - where $w_{control}$ is `ctrl_cost_weight` (default is $0.1$). + where $w_{control}$ is the `ctrl_cost_weight` (default is $0.1$). The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*, and `info` will also contain the individual reward terms. diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 082a77c64..7339a3e08 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -76,7 +76,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): Every timestep that the Hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`. - *forward_reward*: - A reward of moving forward, + A reward for moving forward, this reward would be positive if the Hopper moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the "torso" ($x_{after-action} - x_{before-action}$), diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index 6c4245d99..306b3dfc2 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -198,7 +198,7 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): Every timestep that the Humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward`. - *forward_reward*: - A reward of moving forward, + A reward for moving forward, this reward would be positive if the Humanoid moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the center of mass ($x_{after-action} - x_{before-action}$), diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 7f64fa113..5d57399bd 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -75,7 +75,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): ## Rewards The reward consists of two parts: - *forward_reward*: - A reward of moving forward, + A reward for moving forward, this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index 8c286841c..7725929e2 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -82,7 +82,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): - *healthy_reward*: Every timestep that the Walker2d is alive, it receives a fixed reward of value `healthy_reward`, - *forward_reward*: - A reward of moving forward, + A reward for moving forward, this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), From 4088d6c4d82f86e54915259aec1b1698a3d967d4 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 21:37:39 +0300 Subject: [PATCH 21/53] typo --- gymnasium/envs/mujoco/humanoid_v5.py | 2 +- gymnasium/envs/mujoco/humanoidstandup_v5.py | 2 +- gymnasium/envs/mujoco/swimmer_v5.py | 2 +- gymnasium/envs/mujoco/walker2d_v5.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index 306b3dfc2..929048a37 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -202,7 +202,7 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Humanoid moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the center of mass ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 5), + $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 5), and `frametime` which is 0.001 - making the default $dt = 5 \times 0.003 = 0.015$, $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). The calculation for the center of mass is defined in the `.py` file for the Humanoid. diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index 72a419046..ecacbb704 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -199,7 +199,7 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): but it is an absolute reward which measures how much upward the Humanoid has moved overall. It is measured as $weight_{uph} \times (z_{after action} - 0)/dt$, where $z_{after action}$ is the z coordinate of the torso after taking an action, - and *dt* is the time between actions and is dependent on the `frame_skip` parameter + and *dt* is the time between actions, and is dependent on the `frame_skip` parameter (default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*. and $weight_{uph}$ is `uph_cost_weight`. - *quad_ctrl_cost*: diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 5d57399bd..901403aad 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -79,7 +79,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 4), + $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), and `frametime` which is 0.01 - making the default $dt = 4 \times 0.01 = 0.04$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index 7725929e2..fd7f6a87d 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -86,7 +86,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 4), + $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), and `frametime` which is 0.002 - making the default $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: From 8ce48de17cd805473deb3304a6b06cc6f2e04b12 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 21:57:51 +0300 Subject: [PATCH 22/53] typo --- gymnasium/envs/mujoco/hopper_v5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 7339a3e08..d563d6691 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -80,7 +80,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Hopper moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the "torso" ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions which is dependent on the `frame_skip` parameter (default is 4), + $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), and `frametime` which is 0.002 - making the default $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: From 6f8f0124bc10ccdfa2a538c915ac515f96569c5b Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 22:05:00 +0300 Subject: [PATCH 23/53] typo --- gymnasium/envs/mujoco/ant_v5.py | 1 - gymnasium/envs/mujoco/swimmer_v5.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index d8c089e01..3537b3286 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -147,7 +147,6 @@ class AntEnv(MujocoEnv, utils.EzPickle): $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-4}$), $F_{contact}$ are the external contact forces clipped by `contact_force_range` (see `cfrc_ext` section on observation). - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. and `info` will also contain the individual reward terms. diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 901403aad..d1d857a41 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -88,7 +88,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-4}$). The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*, - and `info` will also contain the individual reward terms + and `info` will also contain the individual reward terms. ## Starting State From 215f0d396b8072c94a63378d0894d5e498530d1c Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 22:48:40 +0300 Subject: [PATCH 24/53] doc --- gymnasium/envs/mujoco/hopper_v5.py | 2 +- gymnasium/envs/mujoco/humanoid_v5.py | 2 +- gymnasium/envs/mujoco/humanoidstandup_v5.py | 2 +- gymnasium/envs/mujoco/swimmer_v5.py | 2 +- gymnasium/envs/mujoco/walker2d_v5.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index d563d6691..7bdae99f5 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -81,7 +81,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the "torso" ($x_{after-action} - x_{before-action}$), $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), - and `frametime` which is 0.002 - making the default $dt = 4 \times 0.002 = 0.008$, + and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: A negative reward for penalizing the Hopper if it takes actions that are too large. diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index 929048a37..b21bd18a3 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -203,7 +203,7 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the center of mass ($x_{after-action} - x_{before-action}$), $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 5), - and `frametime` which is 0.001 - making the default $dt = 5 \times 0.003 = 0.015$, + and `frametime` which is 0.001 - so the default is $dt = 5 \times 0.003 = 0.015$, $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). The calculation for the center of mass is defined in the `.py` file for the Humanoid. - *ctrl_cost*: diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index ecacbb704..9b1c202d3 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -200,7 +200,7 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): It is measured as $weight_{uph} \times (z_{after action} - 0)/dt$, where $z_{after action}$ is the z coordinate of the torso after taking an action, and *dt* is the time between actions, and is dependent on the `frame_skip` parameter - (default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*. + (default is 5), where the frametime is 0.003 - so the default is *dt = 5 * 0.003 = 0.015*. and $weight_{uph}$ is `uph_cost_weight`. - *quad_ctrl_cost*: A negative reward for penalizing the Humanoid if it takes actions that are too large. diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index d1d857a41..9cd48ff78 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -80,7 +80,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), - and `frametime` which is 0.01 - making the default $dt = 4 \times 0.01 = 0.04$, + and `frametime` which is 0.01 - so the default is $dt = 4 \times 0.01 = 0.04$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: A negative reward for penalizing the Swimmer if it takes actions that are too large. diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index fd7f6a87d..397d52bd4 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -87,7 +87,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), - and `frametime` which is 0.002 - making the default $dt = 4 \times 0.002 = 0.008$, + and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: A negative reward for penalizing the Walker2d if it takes actions that are too large. From 8016f4004e2f7c6bf7345ea55fc8dfcb9a694c56 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 22:52:33 +0300 Subject: [PATCH 25/53] typo --- gymnasium/envs/mujoco/hopper_v5.py | 2 +- gymnasium/envs/mujoco/humanoid_v5.py | 2 +- gymnasium/envs/mujoco/humanoidstandup_v5.py | 2 +- gymnasium/envs/mujoco/swimmer_v5.py | 2 +- gymnasium/envs/mujoco/walker2d_v5.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 7bdae99f5..dcf3fb70d 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -80,7 +80,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Hopper moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the "torso" ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 4), and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index b21bd18a3..a117782de 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -202,7 +202,7 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Humanoid moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the center of mass ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 5), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 5), and `frametime` which is 0.001 - so the default is $dt = 5 \times 0.003 = 0.015$, $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). The calculation for the center of mass is defined in the `.py` file for the Humanoid. diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index 9b1c202d3..9f6491827 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -199,7 +199,7 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): but it is an absolute reward which measures how much upward the Humanoid has moved overall. It is measured as $weight_{uph} \times (z_{after action} - 0)/dt$, where $z_{after action}$ is the z coordinate of the torso after taking an action, - and *dt* is the time between actions, and is dependent on the `frame_skip` parameter + and *dt* is the time between actions, and depends on the `frame_skip` parameter (default is 5), where the frametime is 0.003 - so the default is *dt = 5 * 0.003 = 0.015*. and $weight_{uph}$ is `uph_cost_weight`. - *quad_ctrl_cost*: diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 9cd48ff78..9af6e635d 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -79,7 +79,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 4), and `frametime` which is 0.01 - so the default is $dt = 4 \times 0.01 = 0.04$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index 397d52bd4..c658eb5b4 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -86,7 +86,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). $w_{forward} \times \frac{dx}{dt}$, where $dx$ is the displacement of the (front) "tip" ($x_{after-action} - x_{before-action}$), - $dt$ is the time between actions, which is dependent on the `frame_skip` parameter (default is 4), + $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 4), and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: From 640eeca686eab4c62fb7b3af3b8b47f0d9a4045f Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 23:15:37 +0300 Subject: [PATCH 26/53] doc --- gymnasium/envs/mujoco/hopper_v5.py | 2 +- gymnasium/envs/mujoco/humanoid_v5.py | 4 ++-- gymnasium/envs/mujoco/humanoidstandup_v5.py | 4 ++-- gymnasium/envs/mujoco/pusher_v5.py | 2 +- gymnasium/envs/mujoco/reacher_v5.py | 2 +- gymnasium/envs/mujoco/swimmer_v5.py | 2 +- gymnasium/envs/mujoco/walker2d_v5.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index dcf3fb70d..a937ade49 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -84,7 +84,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: - A negative reward for penalizing the Hopper if it takes actions that are too large. + A negative reward to penalize the Hopper for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index a117782de..a47937f74 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -207,12 +207,12 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). The calculation for the center of mass is defined in the `.py` file for the Humanoid. - *ctrl_cost*: - A negative reward for penalizing the Humanoid if it takes actions that are too large. + A negative reward to penalize the Humanoid for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $0.1$). If there are *nu* actuators/controls, then the control has shape `nu x 1`. - *contact_cost*: - A negative reward for penalizing the Humanoid if the external contact forces are too large. + A negative reward to penalize the Humanoid if the external contact forces are too large. $w_{contact} \times clamp(contact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-7}$), $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index 9f6491827..bdae6fa6e 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -203,12 +203,12 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): (default is 5), where the frametime is 0.003 - so the default is *dt = 5 * 0.003 = 0.015*. and $weight_{uph}$ is `uph_cost_weight`. - *quad_ctrl_cost*: - A negative reward for penalizing the Humanoid if it takes actions that are too large. + A negative reward to penalize the Humanoid for taking actions that are too large. $w_{quad_control} \times \\|action\\|_2^2$, where $w_{quad_control}$ is `ctrl_cost_weight` (default is $0.1$). If there are *nu* actuators/controls, then the control has shape `nu x 1`. - *impact_cost*: - A negative reward for penalizing the Humanoid if the external contact forces are too large. + A negative reward to penalize the Humanoid if the external contact forces are too large. $w_{impact} \times clamp(impact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where $w_{impact}$ is `impact_cost_weight` (default is $5\times10^{-7}$), $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index f0fc6b059..9874e41b3 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -88,7 +88,7 @@ class PusherEnv(MujocoEnv, utils.EzPickle): It is $-w_{dist} \|(P_{object} - P_{target})\|_2$. where $w_{dist}$ is `reward_dist_weight`. - *reward_control*: - A negative reward for penalising the pusher if it takes actions that are too large. + A negative reward to penalize the pusher for taking actions that are too large. It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. where $w_{control}$ is `reward_control_weight`. diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index 2227e848d..cffe30b1f 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -73,7 +73,7 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): It is $-w_{near} \|(P_{fingertip} - P_{target})\|_2$. where $w_{near}$ is `reward_near_weight`. - *reward_control*: - A negative reward for penalising the walker if it takes actions that are too large. + A negative reward to penalize the walker for taking actions that are too large. It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. where $w_{control}$ is `reward_control_weight`. diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 9af6e635d..3c4764eaf 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -83,7 +83,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): and `frametime` which is 0.01 - so the default is $dt = 4 \times 0.01 = 0.04$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: - A negative reward for penalizing the Swimmer if it takes actions that are too large. + A negative reward to penalize the Swimmer for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-4}$). diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index c658eb5b4..a536e862e 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -90,7 +90,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): and `frametime` which is 0.002 - so the default is $dt = 4 \times 0.002 = 0.008$, $w_{forward}$ is the `forward_reward_weight` (default is $1$). - *ctrl_cost*: - A negative reward for penalizing the Walker2d if it takes actions that are too large. + A negative reward to penalize the Walker2d for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). From 26672f9217091afb9560f7704696d7e3ac317e83 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 1 Jul 2023 23:32:29 +0300 Subject: [PATCH 27/53] cleanup --- gymnasium/envs/mujoco/humanoid_v5.py | 4 ++-- gymnasium/envs/mujoco/humanoidstandup_v5.py | 6 +++--- gymnasium/envs/mujoco/inverted_double_pendulum_v5.py | 7 +++---- gymnasium/envs/mujoco/inverted_pendulum_v5.py | 8 ++++---- gymnasium/envs/mujoco/pusher_v5.py | 8 ++++---- gymnasium/envs/mujoco/reacher_v5.py | 6 +++--- 6 files changed, 19 insertions(+), 20 deletions(-) diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index a47937f74..6dc196be8 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -205,12 +205,12 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 5), and `frametime` which is 0.001 - so the default is $dt = 5 \times 0.003 = 0.015$, $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). - The calculation for the center of mass is defined in the `.py` file for the Humanoid. + The center of mass calculation is defined in the `.py` file for the Humanoid. - *ctrl_cost*: A negative reward to penalize the Humanoid for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $0.1$). - If there are *nu* actuators/controls, then the control has shape `nu x 1`. + If there are *nu* actuators/controls, then the control has shape `nu x 1`. - *contact_cost*: A negative reward to penalize the Humanoid if the external contact forces are too large. $w_{contact} \times clamp(contact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index bdae6fa6e..8d2b967c0 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -194,9 +194,9 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): ## Rewards The reward consists of three parts: - *uph_cost*: - A reward for moving upward (in an attempt to stand up). - This is not a relative reward which measures how much upward it has moved from the last timestep, - but it is an absolute reward which measures how much upward the Humanoid has moved overall. + A reward for moving up (trying to stand up). + This is not a relative reward, measuring how far up it has moved since the last timestep, + but it is an absolute reward that measures how much upward the Humanoid has moved up in total. It is measured as $weight_{uph} \times (z_{after action} - 0)/dt$, where $z_{after action}$ is the z coordinate of the torso after taking an action, and *dt* is the time between actions, and depends on the `frame_skip` parameter diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py index 8e0d4eaaf..a5337b8b2 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -74,15 +74,14 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): ## Rewards The reward consists of two parts: - *alive_bonus*: - The goal is to make the second inverted pendulum stand upright - (within a certain angle limit) as long as possible - as such a reward of `healthy_reward` is awarded - for each timestep that the second pole is upright. + The goal is to keep the second inverted pendulum upright (within a certain angle limit) as long as possible - + so for each timestep that the second pole is upright, a reward of `healthy_reward` is given. - *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum (the only free end) moves, and it is calculated as $0.01 x_{pole2-tip}^2 + (y_{pole2-tip}-2)^2$, where $x_{pole2-tip}, y_{pole2-tip}$ are the xy-coordinatesof the tip of the second pole. - *velocity_penalty*: - A negative reward for penalising the agent if it moves too fast. + A negative reward to penalize the agent for moving too fast. $10^{-3} \omega_1 + 5 10^{-3} \omega_2$, where $\omega_1, \omega_2$ are the angular velocities of the hinges. diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_pendulum_v5.py index 4be166b0c..75b964755 100644 --- a/gymnasium/envs/mujoco/inverted_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_pendulum_v5.py @@ -54,14 +54,14 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): ## Rewards - The goal is to make the inverted pendulum stand upright (within a certain angle limit) - as long as possible - as such a reward of +1 is awarded for each timestep that + The goal is to keep the inverted pendulum stand upright (within a certain angle limit) + for as long as possible - as such a reward of +1 is awarded for each timestep that the pole is upright. The pole is considered upright if: - $|angle| < 0.2$ + $|angle| < 0.2$. - and `info` will also contain the reward. + and `info` also contains the reward. ## Starting State diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index 9874e41b3..72744d192 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -81,16 +81,16 @@ class PusherEnv(MujocoEnv, utils.EzPickle): This reward is a measure of how far the *fingertip* of the pusher (the unattached end) is from the object, with a more negative value assigned for when the pusher's *fingertip* is further away from the target. It is $-w_{near} \|(P_{fingertip} - P_{target})\|_2$. - where $w_{near}$ is `reward_near_weight`. + where $w_{near}$ is the `reward_near_weight`. - *reward_dist*: This reward is a measure of how far the object is from the target goal position, - with a more negative value assigned for object that is further away from the target. + with a more negative value assigned if the object that is further away from the target. It is $-w_{dist} \|(P_{object} - P_{target})\|_2$. - where $w_{dist}$ is `reward_dist_weight`. + where $w_{dist}$ is the `reward_dist_weight`. - *reward_control*: A negative reward to penalize the pusher for taking actions that are too large. It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. - where $w_{control}$ is `reward_control_weight`. + where $w_{control}$ is the `reward_control_weight`. The total reward returned is ***reward*** *=* *reward_dist + reward_ctrl + reward_near*, `info` will also contain the individual reward terms. diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index cffe30b1f..ac2242bc7 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -69,13 +69,13 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): The reward consists of two parts: - *reward_distance*: This reward is a measure of how far the *fingertip* of the reacher (the unattached end) is from the target, - with a more negative value assigned for when the reacher's *fingertip* is further away from the target. + with a more negative value assigned for if the reacher's *fingertip* is further away from the target. It is $-w_{near} \|(P_{fingertip} - P_{target})\|_2$. - where $w_{near}$ is `reward_near_weight`. + where $w_{near}$ is the `reward_near_weight`. - *reward_control*: A negative reward to penalize the walker for taking actions that are too large. It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. - where $w_{control}$ is `reward_control_weight`. + where $w_{control}$ is the `reward_control_weight`. The total reward returned is ***reward*** *=* *reward_distance + reward_control*. `info` will also contain the individual reward terms. From 33d62fa8b2c18a0d6feb3b472e630b7be6dd635a Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 3 Jul 2023 16:10:10 +0300 Subject: [PATCH 28/53] update based on @pseudo-rnd-thoughts review --- gymnasium/envs/mujoco/ant_v5.py | 19 +++---------------- gymnasium/envs/mujoco/hopper_v5.py | 4 ---- gymnasium/envs/mujoco/humanoid_v5.py | 10 ---------- gymnasium/envs/mujoco/humanoidstandup_v5.py | 7 ------- .../mujoco/inverted_double_pendulum_v5.py | 2 -- gymnasium/envs/mujoco/pusher_v5.py | 2 +- gymnasium/envs/mujoco/reacher_v5.py | 5 +---- gymnasium/envs/mujoco/swimmer_v5.py | 2 +- gymnasium/envs/mujoco/walker2d_v5.py | 8 ++------ tests/envs/mujoco/test_mujoco_v5.py | 2 +- 10 files changed, 9 insertions(+), 52 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 3537b3286..6d0bb4fe7 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -172,7 +172,7 @@ class AntEnv(MujocoEnv, utils.EzPickle): 1. Any of the state space values is no longer finite 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0]) - #### truncation + #### Truncation The maximum duration of an episode is 1000 timesteps. @@ -338,35 +338,27 @@ def is_healthy(self): @property def terminated(self): terminated = (not self.is_healthy) and self._terminate_when_unhealthy - # TODO remove after validation - assert terminated == ( - not self.is_healthy if self._terminate_when_unhealthy else False - ) return terminated def step(self, action): xy_position_before = self.data.body(self._main_body).xpos[:2].copy() - # TODO remove after validation - # assert (xy_position_before == self.get_body_com("torso")[:2].copy()).all() self.do_simulation(action, self.frame_skip) xy_position_after = self.data.body(self._main_body).xpos[:2].copy() - # TODO remove after validation - # assert (xy_position_after == self.get_body_com("torso")[:2].copy()).all() xy_velocity = (xy_position_after - xy_position_before) / self.dt x_velocity, y_velocity = xy_velocity forward_reward = x_velocity * self._forward_reward_weight healthy_reward = self.healthy_reward - rewards = forward_reward + healthy_reward ctrl_cost = self.control_cost(action) contact_cost = self.contact_cost costs = ctrl_cost + contact_cost - terminated = self.terminated observation = self._get_obs() + reward = rewards - costs + terminated = self.terminated info = { "reward_forward": forward_reward, "reward_ctrl": -ctrl_cost, @@ -379,8 +371,6 @@ def step(self, action): "y_velocity": y_velocity, } - reward = rewards - costs - if self.render_mode == "human": self.render() return observation, reward, terminated, False, info @@ -393,9 +383,6 @@ def _get_obs(self): position = position[2:] if self._include_cfrc_ext_in_observation: - assert ( - self.contact_forces[0].flat.copy() == 0 - ).all() # TODO remove after validation contact_force = self.contact_forces[1:].flat.copy() assert len(contact_force) == 78 return np.concatenate((position, velocity, contact_force)) diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index a937ade49..aab4c5236 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -259,10 +259,6 @@ def is_healthy(self): @property def terminated(self): terminated = (not self.is_healthy) and self._terminate_when_unhealthy - # TODO remove after validation - assert terminated == ( - not self.is_healthy if self._terminate_when_unhealthy else False - ) return terminated def _get_obs(self): diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index 6dc196be8..f3548c747 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -415,10 +415,6 @@ def is_healthy(self): @property def terminated(self): terminated = (not self.is_healthy) and self._terminate_when_unhealthy - # TODO remove after validation - assert terminated == ( - not self.is_healthy if self._terminate_when_unhealthy else False - ) return terminated def _get_obs(self): @@ -443,12 +439,6 @@ def _get_obs(self): else: external_contact_forces = np.array([]) - # TODO remove after validation - assert (self.data.cinert[0].flat.copy() == 0).all() - assert (self.data.cvel[0].flat.copy() == 0).all() - assert (self.data.qfrc_actuator[:6].flat.copy() == 0).all() - assert (self.data.cfrc_ext[0].flat.copy() == 0).all() - if self._exclude_current_positions_from_observation: position = position[2:] diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index 8d2b967c0..a5b4e54de 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -395,12 +395,6 @@ def _get_obs(self): else: external_contact_forces = np.array([]) - # TODO remove after validation - assert (self.data.cinert[0].flat.copy() == 0).all() - assert (self.data.cvel[0].flat.copy() == 0).all() - assert (self.data.qfrc_actuator[:6].flat.copy() == 0).all() - assert (self.data.cfrc_ext[0].flat.copy() == 0).all() - if self._exclude_current_positions_from_observation: position = position[2:] @@ -430,7 +424,6 @@ def step(self, action): quad_impact_cost = np.clip(quad_impact_cost, min_impact_cost, max_impact_cost) reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1 - info = { "reward_linup": uph_cost, "reward_quadctrl": -quad_ctrl_cost, diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py index a5337b8b2..5c52d3b05 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -197,8 +197,6 @@ def step(self, action): return observation, reward, terminated, False, info def _get_obs(self): - assert self.data.qfrc_constraint[2] == 0 # TODO remove after validation - assert self.data.qfrc_constraint[1] == 0 # TODO remove after validation return np.concatenate( [ self.data.qpos[:1], # cart x pos diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index 72744d192..93a4c32f3 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -198,11 +198,11 @@ def step(self, action): reward_near = -np.linalg.norm(vec_1) * self._reward_near_weight reward_dist = -np.linalg.norm(vec_2) * self._reward_dist_weight reward_ctrl = -np.square(action).sum() * self._reward_control_weight - reward = reward_dist + reward_ctrl + reward_near self.do_simulation(action, self.frame_skip) observation = self._get_obs() + reward = reward_dist + reward_ctrl + reward_near info = { "reward_dist": reward_dist, "reward_ctrl": reward_ctrl, diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index ac2242bc7..e986183d4 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -175,11 +175,11 @@ def step(self, action): vec = self.get_body_com("fingertip") - self.get_body_com("target") reward_dist = -np.linalg.norm(vec) * self._reward_dist_weight reward_ctrl = -np.square(action).sum() * self._reward_control_weight - reward = reward_dist + reward_ctrl self.do_simulation(action, self.frame_skip) observation = self._get_obs() + reward = reward_dist + reward_ctrl info = { "reward_dist": reward_dist, "reward_ctrl": reward_ctrl, @@ -207,9 +207,6 @@ def reset_model(self): def _get_obs(self): theta = self.data.qpos.flat[:2] - assert (self.get_body_com("fingertip") - self.get_body_com("target"))[ - 2 - ] == 0 # TODO remove after validation return np.concatenate( [ np.cos(theta), diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 3c4764eaf..8b647124a 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -99,7 +99,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): #### Termination The Swimmer never terminates. - #### truncation + #### Truncation The maximum duration of an episode is 1000 timesteps. diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index a536e862e..d2d12a0ed 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -105,7 +105,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): ## Episode End - #### termination + #### Termination If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Walker2d is unhealthy. The Walker2d is unhealthy if any of the following happens: @@ -113,7 +113,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): 2. The height of the walker is ***not*** in the closed interval specified by `healthy_z_range` 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` - #### truncation + #### Truncation the maximum duration of an episode is 1000 timesteps. If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. @@ -260,10 +260,6 @@ def is_healthy(self): @property def terminated(self): terminated = (not self.is_healthy) and self._terminate_when_unhealthy - # TODO remove after validation - assert terminated == ( - not self.is_healthy if self._terminate_when_unhealthy else False - ) return terminated def _get_obs(self): diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index 2c51158b0..a2f5e45aa 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -306,7 +306,7 @@ def test_reward_sum(version: str): ) -# Note: the environtments that are not present, is because they do not have identical behaviour +# Note: the environtments "HalfCheetah", "Pusher", "Swimmer" def test_identical_behaviour_v45(): """Verify that v4 -> v5 transition. Does not change the behaviour of the environments in any unexpected way.""" NUM_STEPS = 100 From 4184838467a6e11f0fb5c0e04fa5b6ecfffabb9b Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 3 Jul 2023 17:44:23 +0300 Subject: [PATCH 29/53] typo fix --- gymnasium/envs/mujoco/ant_v5.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 6d0bb4fe7..6a2c420c1 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -384,7 +384,6 @@ def _get_obs(self): if self._include_cfrc_ext_in_observation: contact_force = self.contact_forces[1:].flat.copy() - assert len(contact_force) == 78 return np.concatenate((position, velocity, contact_force)) else: return np.concatenate((position, velocity)) From c4b3d4ab70c241eca9d22200156d8c50ce177819 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 4 Jul 2023 01:27:42 +0300 Subject: [PATCH 30/53] cleanup tests --- tests/envs/mujoco/test_mujoco_v5.py | 36 +++++++++++------------------ 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index a2f5e45aa..1559bd8c1 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -170,9 +170,9 @@ def test_verify_reward_survive(env: str, version: str): """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states.""" env = gym.make(f"{env}-{version}", reset_noise_scale=0) env.reset(seed=0) - env.action_space.seed(0) + env.action_space.seed(1) - for step in range(175): + for step in range(80): obs, rew, terminal, truncated, info = env.step(env.action_space.sample()) if terminal: @@ -213,7 +213,7 @@ def test_frame_skip(env: str, version: str, frame_skip: int): raise Error(f"Unexpected warning: {warning.message}") -# Dev Note: This can be version env parametrized because each env has it's own reward function +# Dev Note: This can not be version env parametrized because each env has it's own reward function @pytest.mark.parametrize("version", ["v5"]) def test_reward_sum(version: str): """Assert that the total reward equals the sum of the individual reward terms, also asserts that the reward function has no fp ordering arithmetic errors.""" @@ -321,9 +321,11 @@ def test_identical_behaviour_v45(): env_v5 = gym.make("HalfCheetah-v5") check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") - # env_v4 = gym.make("Hopper-v4") - # env_v5 = gym.make("Hopper-v5") - # check_environments_match(env_v4, env_v5, num_steps, skip_rew=true, info_comparison="superset") + env_v4 = gym.make("Hopper-v4") + env_v5 = gym.make("Hopper-v5") + check_environments_match( + env_v4, env_v5, NUM_STEPS, skip_rew=True, info_comparison="superset" + ) # skipping humanoid, everything has changed @@ -352,19 +354,21 @@ def test_identical_behaviour_v45(): env_v4 = gym.make("Pusher-v4") env_v5 = gym.make("Pusher-v5") - check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") + check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="keys-superset") env_v4 = gym.make("Reacher-v4") env_v5 = gym.make("Reacher-v5") check_environments_match( - env_v4, env_v5, NUM_STEPS, skip_obs=True, info_comparison="skip" + env_v4, env_v5, NUM_STEPS, skip_obs=True, info_comparison="keys-equivalence" ) env_v4 = gym.make("Swimmer-v4") env_v5 = gym.make("Swimmer-v5") check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") - # skipping Walker2d, since the model has changed + env_v4 = gym.make("Walker2d-v4") + env_v5 = gym.make("Walker2d-v5") + check_environments_match(env_v4, env_v5, NUM_STEPS, skip_obs=True, skip_rew=True, skip_terminal=True, info_comparison="keys-superset") @pytest.mark.parametrize("version", ["v5", "v4"]) @@ -502,19 +506,7 @@ def test_reset_info(env: str, version: str): assert len(reset_info) > 0 -""" -[Bug Report] [Documentation] Inverted Double Pendulum max Height is wrong - -The Documentation States: -```md -The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other) -``` -but the height of each pole is 0.6 (0.6+0.6==1.2) -https://github.com/Farama-Foundation/Gymnasium/blob/deb50802facfd827abd4d1f0cf1069afb12a726b/gymnasium/envs/mujoco/assets/inverted_double_pendulum.xml#L33-L39 -""" - - -# Note: the max height used to be wrong in the documentation. +# Note: the max height used to be wrong in the documentation. (1.196m instead of 1.2m) @pytest.mark.parametrize("version", ["v5"]) def test_inverted_double_pendulum_max_height(version: str): """Verify the max height of Inverted Double Pendulum.""" From 6edadfa500499048a1b5c05cfed65b2925522041 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 4 Jul 2023 22:42:40 +0300 Subject: [PATCH 31/53] black --- tests/envs/mujoco/test_mujoco_v5.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index 1559bd8c1..99fac8aa1 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -368,7 +368,15 @@ def test_identical_behaviour_v45(): env_v4 = gym.make("Walker2d-v4") env_v5 = gym.make("Walker2d-v5") - check_environments_match(env_v4, env_v5, NUM_STEPS, skip_obs=True, skip_rew=True, skip_terminal=True, info_comparison="keys-superset") + check_environments_match( + env_v4, + env_v5, + NUM_STEPS, + skip_obs=True, + skip_rew=True, + skip_terminal=True, + info_comparison="keys-superset", + ) @pytest.mark.parametrize("version", ["v5", "v4"]) From 4defe7a117192dc3c26a13cc1ca1b835006d1b57 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sat, 8 Jul 2023 01:27:30 +0300 Subject: [PATCH 32/53] fix typing issues --- tests/envs/mujoco/test_mujoco_v5.py | 38 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index 99fac8aa1..3fda2525b 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -80,11 +80,11 @@ def test_verify_info_y_position(env_id: str): # Note: "HumnanoidStandup-v4" does not have `info` -@pytest.mark.parametrize("env", ["HalfCheetah", "Hopper", "Swimmer", "Walker2d"]) +@pytest.mark.parametrize("env_name", ["HalfCheetah", "Hopper", "Swimmer", "Walker2d"]) @pytest.mark.parametrize("version", ["v5", "v4", "v3"]) -def test_verify_info_x_velocity(env: str, version: str): +def test_verify_info_x_velocity(env_name: str, version: str): """Asserts that the environment `info['x_velocity']` is properly assigned.""" - env = gym.make(f"{env}-{version}") + env = gym.make(f"{env_name}-{version}") env.reset() old_x = env.unwrapped.data.qpos[0] @@ -155,7 +155,7 @@ def mass_center(model, data): # Note: Inverted(Double)Pendulum-v4/2 does not have `info['reward_survive']`, but it is still affected # Note: all `v4/v3/v2` environments with a heathly reward are fail this test @pytest.mark.parametrize( - "env", + "env_name", [ "Ant", "Hopper", @@ -166,9 +166,9 @@ def mass_center(model, data): ], ) @pytest.mark.parametrize("version", ["v5"]) -def test_verify_reward_survive(env: str, version: str): +def test_verify_reward_survive(env_name: str, version: str): """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states.""" - env = gym.make(f"{env}-{version}", reset_noise_scale=0) + env = gym.make(f"{env_name}-{version}", reset_noise_scale=0) env.reset(seed=0) env.action_space.seed(1) @@ -196,12 +196,12 @@ def test_verify_reward_survive(env: str, version: str): ] -@pytest.mark.parametrize("env", ALL_MUJOCO_ENVS) +@pytest.mark.parametrize("env_name", ALL_MUJOCO_ENVS) @pytest.mark.parametrize("version", ["v5"]) @pytest.mark.parametrize("frame_skip", [1, 2, 3, 4, 5]) -def test_frame_skip(env: str, version: str, frame_skip: int): +def test_frame_skip(env_name: str, version: str, frame_skip: int): """Verify that all `mujoco` envs work with different `frame_skip` values.""" - env_id = f"{env}-{version}" + env_id = f"{env_name}-{version}" env = gym.make(env_id, frame_skip=frame_skip) # Test if env adheres to Gym API @@ -306,7 +306,7 @@ def test_reward_sum(version: str): ) -# Note: the environtments "HalfCheetah", "Pusher", "Swimmer" +# Note: the environtments "HalfCheetah", "Pusher", "Swimmer", are identical between `v4` & `v5` (excluding `info`) def test_identical_behaviour_v45(): """Verify that v4 -> v5 transition. Does not change the behaviour of the environments in any unexpected way.""" NUM_STEPS = 100 @@ -435,11 +435,11 @@ def test_distance_from_origin_info(env_id: str): ) -@pytest.mark.parametrize("env", ["Hopper", "HumanoidStandup", "Walker2d"]) +@pytest.mark.parametrize("env_name", ["Hopper", "HumanoidStandup", "Walker2d"]) @pytest.mark.parametrize("version", ["v5"]) -def test_z_distance_from_origin_info(env: str, version: str): +def test_z_distance_from_origin_info(env_name: str, version: str): """Verify that `info"z_distance_from_origin"` is correct.""" - env = gym.make(f"{env}-{version}") + env = gym.make(f"{env_name}-{version}") env.reset() _, _, _, _, info = env.step(env.action_space.sample()) mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) @@ -451,7 +451,7 @@ def test_z_distance_from_origin_info(env: str, version: str): @pytest.mark.parametrize( - "env", + "env_name", [ "Ant", "HalfCheetah", @@ -464,9 +464,9 @@ def test_z_distance_from_origin_info(env: str, version: str): ], ) @pytest.mark.parametrize("version", ["v5"]) -def test_observation_structure(env: str, version: str): +def test_observation_structure(env_name: str, version: str): """Verify that the `env.observation_structure` is properly defined.""" - env = gym.make(f"{env}-{version}") + env = gym.make(f"{env_name}-{version}") if hasattr(env, "observation_structure"): return @@ -491,7 +491,7 @@ def test_observation_structure(env: str, version: str): @pytest.mark.parametrize( - "env", + "env_name", [ "Ant", "HalfCheetah", @@ -507,9 +507,9 @@ def test_observation_structure(env: str, version: str): ], ) @pytest.mark.parametrize("version", ["v5"]) -def test_reset_info(env: str, version: str): +def test_reset_info(env_name: str, version: str): """Verify that the environment returns info with `reset()`.""" - env = gym.make(f"{env}-{version}") + env = gym.make(f"{env_name}-{version}") _, reset_info = env.reset() assert len(reset_info) > 0 From 925ffc76b4401e77c5f90a0030ec5e1abe6416ae Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sun, 9 Jul 2023 18:16:00 +0300 Subject: [PATCH 33/53] fix (most) `pyright.TypeIssues` --- tests/envs/mujoco/test_mujoco_v5.py | 350 +++++++++++++++------------- 1 file changed, 183 insertions(+), 167 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index 3fda2525b..58c5ba241 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -5,8 +5,10 @@ import pytest import gymnasium as gym +from gymnasium.envs.mujoco.mujoco_env import BaseMujocoEnv, MujocoEnv from gymnasium.error import Error from gymnasium.utils.env_match import check_environments_match +from gymnasium.utils.env_checker import check_env ALL_MUJOCO_ENVS = [ @@ -84,12 +86,13 @@ def test_verify_info_y_position(env_id: str): @pytest.mark.parametrize("version", ["v5", "v4", "v3"]) def test_verify_info_x_velocity(env_name: str, version: str): """Asserts that the environment `info['x_velocity']` is properly assigned.""" - env = gym.make(f"{env_name}-{version}") + env = gym.make(f"{env_name}-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() - old_x = env.unwrapped.data.qpos[0] + old_x = env.data.qpos[0] _, _, _, _, info = env.step(env.action_space.sample()) - new_x = env.unwrapped.data.qpos[0] + new_x = env.data.qpos[0] dx = new_x - old_x vel_x = dx / env.dt @@ -100,12 +103,13 @@ def test_verify_info_x_velocity(env_name: str, version: str): @pytest.mark.parametrize("env_id", ["Swimmer-v5", "Swimmer-v4", "Swimmer-v3"]) def test_verify_info_y_velocity(env_id: str): """Asserts that the environment `info['y_velocity']` is properly assigned.""" - env = gym.make(env_id) + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() - old_y = env.unwrapped.data.qpos[1] + old_y = env.data.qpos[1] _, _, _, _, info = env.step(env.action_space.sample()) - new_y = env.unwrapped.data.qpos[1] + new_y = env.data.qpos[1] dy = new_y - old_y vel_y = dy / env.dt @@ -115,7 +119,8 @@ def test_verify_info_y_velocity(env_id: str): @pytest.mark.parametrize("env_id", ["Ant-v5", "Ant-v4", "Ant-v3"]) def test_verify_info_xy_velocity_xpos(env_id: str): """Asserts that the environment `info['x/y_velocity']` is properly assigned, for the ant environment which uses kinmatics for the velocity.""" - env = gym.make(env_id) + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() old_xy = env.get_body_com("torso")[:2].copy() @@ -137,12 +142,13 @@ def mass_center(model, data): xpos = data.xipos return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() - env = gym.make(env_id) + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() - old_xy = mass_center(env.unwrapped.model, env.unwrapped.data) + old_xy = mass_center(env.model, env.data) _, _, _, _, info = env.step(env.action_space.sample()) - new_xy = mass_center(env.unwrapped.model, env.unwrapped.data) + new_xy = mass_center(env.model, env.data) dxy = new_xy - old_xy vel_x, vel_y = dxy / env.dt @@ -168,7 +174,8 @@ def mass_center(model, data): @pytest.mark.parametrize("version", ["v5"]) def test_verify_reward_survive(env_name: str, version: str): """Assert that `reward_survive` is 0 on `terminal` states and not 0 on non-`terminal` states.""" - env = gym.make(f"{env_name}-{version}", reset_noise_scale=0) + env = gym.make(f"{env_name}-{version}", reset_noise_scale=0).unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset(seed=0) env.action_space.seed(1) @@ -206,7 +213,7 @@ def test_frame_skip(env_name: str, version: str, frame_skip: int): # Test if env adheres to Gym API with warnings.catch_warnings(record=True) as w: - gym.utils.env_checker.check_env(env.unwrapped, skip_render_check=True) + check_env(env.unwrapped, skip_render_check=True) env.close() for warning in w: if warning.message.args[0] not in CHECK_ENV_IGNORE_WARNINGS: @@ -382,30 +389,30 @@ def test_identical_behaviour_v45(): @pytest.mark.parametrize("version", ["v5", "v4"]) def test_ant_com(version: str): """Verify the kinmatic behaviour of the ant.""" - env = gym.make( - f"Ant-{version}" - ) # `env` contains `data : MjData` and `model : MjModel` + # `env` contains `data : MjData` and `model : MjModel` + env = gym.make(f"Ant-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() # randomly initlizies the `data.qpos` and `data.qvel`, calls mujoco.mj_forward(env.model, env.data) - x_position_before = env.unwrapped.data.qpos[0] - x_position_before_com = env.unwrapped.data.body("torso").xpos[0] + x_position_before = env.data.qpos[0] + x_position_before_com = env.data.body("torso").xpos[0] assert x_position_before == x_position_before_com, "before failed" # This succeeds random_control = env.action_space.sample() - _, _, _, _, info = env.step( - random_control - ) # This calls mujoco.mj_step(env.model, env.data, nstep=env.frame_skip) - mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) + # This calls mujoco.mj_step(env.model, env.data, nstep=env.frame_skip) + _, _, _, _, info = env.step(random_control) + mujoco.mj_kinematics(env.model, env.data) - x_position_after = env.unwrapped.data.qpos[0] - x_position_after_com = env.unwrapped.data.body("torso").xpos[0] + x_position_after = env.data.qpos[0] + x_position_after_com = env.data.body("torso").xpos[0] assert x_position_after == x_position_after_com, "after failed" # This succeeds @pytest.mark.parametrize("version", ["v5", "v4", "v3", "v2"]) def test_set_state(version: str): """Simple Test to verify that `mujocoEnv.set_state()` works correctly.""" - env = gym.make(f"Hopper-{version}") + env = gym.make(f"Hopper-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() new_qpos = np.array( [0.00136962, 1.24769787, -0.00459026, -0.00483472, 0.0031327, 0.00412756] @@ -414,8 +421,8 @@ def test_set_state(version: str): [0.00106636, 0.00229497, 0.00043625, 0.00435072, 0.00315854, -0.00497261] ) env.set_state(new_qpos, new_qvel) - assert (env.unwrapped.data.qpos == new_qpos).all() - assert (env.unwrapped.data.qvel == new_qvel).all() + assert (env.data.qpos == new_qpos).all() + assert (env.data.qvel == new_qvel).all() # Note: HumanoidStandup-v4/v3 does not have `info` @@ -427,67 +434,61 @@ def test_set_state(version: str): ) def test_distance_from_origin_info(env_id: str): """Verify that `info"distance_from_origin"` is correct.""" - env = gym.make(env_id) + env = gym.make(env_id).unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() + _, _, _, _, info = env.step(env.action_space.sample()) assert info["distance_from_origin"] == np.linalg.norm( - env.unwrapped.data.qpos[0:2] - env.init_qpos[0:2] + env.data.qpos[0:2] - env.init_qpos[0:2] ) @pytest.mark.parametrize("env_name", ["Hopper", "HumanoidStandup", "Walker2d"]) @pytest.mark.parametrize("version", ["v5"]) def test_z_distance_from_origin_info(env_name: str, version: str): - """Verify that `info"z_distance_from_origin"` is correct.""" - env = gym.make(f"{env_name}-{version}") + """Verify that `info["z_distance_from_origin"]` is correct.""" + env = gym.make(f"{env_name}-{version}").unwrapped + assert isinstance(env, MujocoEnv) env.reset() + _, _, _, _, info = env.step(env.action_space.sample()) - mujoco.mj_kinematics(env.unwrapped.model, env.unwrapped.data) + mujoco.mj_kinematics(env.model, env.data) z_index = env.observation_structure["skipped_qpos"] assert ( info["z_distance_from_origin"] - == env.unwrapped.data.qpos[z_index] - env.init_qpos[z_index] + == env.data.qpos[z_index] - env.init_qpos[z_index] ) @pytest.mark.parametrize( "env_name", - [ - "Ant", - "HalfCheetah", - "Hopper", - "Humanoid", - "HumanoidStandup", - "InvertedPendulum", - "Swimmer", - "Walker2d", - ], + ALL_MUJOCO_ENVS ) @pytest.mark.parametrize("version", ["v5"]) def test_observation_structure(env_name: str, version: str): """Verify that the `env.observation_structure` is properly defined.""" - env = gym.make(f"{env_name}-{version}") - if hasattr(env, "observation_structure"): + env = gym.make(f"{env_name}-{version}").unwrapped + assert isinstance(env, MujocoEnv) + if not hasattr(env, "observation_structure"): return obs_struct = env.observation_structure - assert ( - env.unwrapped.model.nq == obs_struct.get("skipped_qpos", 0) + obs_struct["qpos"] - ) - assert env.unwrapped.model.nv == obs_struct["qvel"] + assert env.model.nq == obs_struct.get("skipped_qpos", 0) + obs_struct["qpos"] + assert env.model.nv == obs_struct["qvel"] if obs_struct.get("cinert", False): - assert (env.unwrapped.model.nbody - 1) * 10 == obs_struct["cinert"] + assert (env.model.nbody - 1) * 10 == obs_struct["cinert"] if obs_struct.get("cvel", False): - assert (env.unwrapped.model.nbody - 1) * 6 == obs_struct["cvel"] + assert (env.model.nbody - 1) * 6 == obs_struct["cvel"] if obs_struct.get("qfrc_actuator", False): - assert env.unwrapped.model.nv - 6 == obs_struct["qfrc_actuator"] + assert env.model.nv - 6 == obs_struct["qfrc_actuator"] if obs_struct.get("cfrc_ext", False): - assert (env.unwrapped.model.nbody - 1) * 6 == obs_struct["cfrc_ext"] + assert (env.model.nbody - 1) * 6 == obs_struct["cfrc_ext"] if obs_struct.get("ten_lenght", False): - assert env.unwrapped.model.ntendon == obs_struct["ten_lenght"] + assert env.model.ntendon == obs_struct["ten_lenght"] if obs_struct.get("ten_velocity", False): - assert env.unwrapped.model.ntendon == obs_struct["ten_velocity"] + assert env.model.ntendon == obs_struct["ten_velocity"] @pytest.mark.parametrize( @@ -518,18 +519,22 @@ def test_reset_info(env_name: str, version: str): @pytest.mark.parametrize("version", ["v5"]) def test_inverted_double_pendulum_max_height(version: str): """Verify the max height of Inverted Double Pendulum.""" - env = gym.make(f"InvertedDoublePendulum-{version}", reset_noise_scale=0) + env = gym.make(f"InvertedDoublePendulum-{version}", reset_noise_scale=0).unwrapped + assert isinstance(env, BaseMujocoEnv) env.reset() - y = env.unwrapped.data.site_xpos[0][2] + + y = env.data.site_xpos[0][2] assert y == 1.2 @pytest.mark.parametrize("version", ["v4"]) def test_inverted_double_pendulum_max_height_old(version: str): """Verify the max height of Inverted Double Pendulum (v4 does not have `reset_noise_scale` argument).""" - env = gym.make(f"InvertedDoublePendulum-{version}") + env = gym.make(f"InvertedDoublePendulum-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) env.set_state(env.init_qpos, env.init_qvel) - y = env.unwrapped.data.site_xpos[0][2] + + y = env.data.site_xpos[0][2] assert y == 1.2 @@ -537,115 +542,126 @@ def test_inverted_double_pendulum_max_height_old(version: str): @pytest.mark.parametrize("version", ["v5", "v4"]) def test_model_object_count(version: str): """Verify that all the objects of the model are loaded, mostly useful for using non-mujoco simulator.""" - env = gym.make(f"Ant-{version}") - assert env.unwrapped.model.nq == 15 - assert env.unwrapped.model.nv == 14 - assert env.unwrapped.model.nu == 8 - assert env.unwrapped.model.nbody == 14 - assert env.unwrapped.model.nbvh == 14 - assert env.unwrapped.model.njnt == 9 - assert env.unwrapped.model.ngeom == 14 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"HalfCheetah-{version}") - assert env.unwrapped.model.nq == 9 - assert env.unwrapped.model.nv == 9 - assert env.unwrapped.model.nu == 6 - assert env.unwrapped.model.nbody == 8 - assert env.unwrapped.model.nbvh == 10 - assert env.unwrapped.model.njnt == 9 - assert env.unwrapped.model.ngeom == 9 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"Hopper-{version}") - assert env.unwrapped.model.nq == 6 - assert env.unwrapped.model.nv == 6 - assert env.unwrapped.model.nu == 3 - assert env.unwrapped.model.nbody == 5 - assert env.unwrapped.model.nbvh == 5 - assert env.unwrapped.model.njnt == 6 - assert env.unwrapped.model.ngeom == 5 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"Humanoid-{version}") - assert env.unwrapped.model.nq == 24 - assert env.unwrapped.model.nv == 23 - assert env.unwrapped.model.nu == 17 - assert env.unwrapped.model.nbody == 14 - assert env.unwrapped.model.nbvh == 22 - assert env.unwrapped.model.njnt == 18 - assert env.unwrapped.model.ngeom == 18 - assert env.unwrapped.model.ntendon == 2 - - env = gym.make(f"HumanoidStandup-{version}") - assert env.unwrapped.model.nq == 24 - assert env.unwrapped.model.nv == 23 - assert env.unwrapped.model.nu == 17 - assert env.unwrapped.model.nbody == 14 - assert env.unwrapped.model.nbvh == 22 - assert env.unwrapped.model.njnt == 18 - assert env.unwrapped.model.ngeom == 18 - assert env.unwrapped.model.ntendon == 2 - - env = gym.make(f"InvertedDoublePendulum-{version}") - assert env.unwrapped.model.nq == 3 - assert env.unwrapped.model.nv == 3 - assert env.unwrapped.model.nu == 1 - assert env.unwrapped.model.nbody == 4 - assert env.unwrapped.model.nbvh == 6 - assert env.unwrapped.model.njnt == 3 - assert env.unwrapped.model.ngeom == 5 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"InvertedPendulum-{version}") - assert env.unwrapped.model.nq == 2 - assert env.unwrapped.model.nv == 2 - assert env.unwrapped.model.nu == 1 - assert env.unwrapped.model.nbody == 3 - assert env.unwrapped.model.nbvh == 3 - assert env.unwrapped.model.njnt == 2 - assert env.unwrapped.model.ngeom == 3 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"Pusher-{version}") - assert env.unwrapped.model.nq == 11 - assert env.unwrapped.model.nv == 11 - assert env.unwrapped.model.nu == 7 - assert env.unwrapped.model.nbody == 13 - assert env.unwrapped.model.nbvh == 18 - assert env.unwrapped.model.njnt == 11 - assert env.unwrapped.model.ngeom == 21 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"Reacher-{version}") - assert env.unwrapped.model.nq == 4 - assert env.unwrapped.model.nv == 4 - assert env.unwrapped.model.nu == 2 - assert env.unwrapped.model.nbody == 5 - assert env.unwrapped.model.nbvh == 5 - assert env.unwrapped.model.njnt == 4 - assert env.unwrapped.model.ngeom == 10 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"Swimmer-{version}") - assert env.unwrapped.model.nq == 5 - assert env.unwrapped.model.nv == 5 - assert env.unwrapped.model.nu == 2 - assert env.unwrapped.model.nbody == 4 - assert env.unwrapped.model.nbvh == 4 - assert env.unwrapped.model.njnt == 5 - assert env.unwrapped.model.ngeom == 4 - assert env.unwrapped.model.ntendon == 0 - - env = gym.make(f"Walker2d-{version}") - assert env.unwrapped.model.nq == 9 - assert env.unwrapped.model.nv == 9 - assert env.unwrapped.model.nu == 6 - assert env.unwrapped.model.nbody == 8 - assert env.unwrapped.model.nbvh == 8 - assert env.unwrapped.model.njnt == 9 - assert env.unwrapped.model.ngeom == 8 - assert env.unwrapped.model.ntendon == 0 + env = gym.make(f"Ant-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 15 + assert env.model.nv == 14 + assert env.model.nu == 8 + assert env.model.nbody == 14 + assert env.model.nbvh == 14 + assert env.model.njnt == 9 + assert env.model.ngeom == 14 + assert env.model.ntendon == 0 + + env = gym.make(f"HalfCheetah-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 9 + assert env.model.nv == 9 + assert env.model.nu == 6 + assert env.model.nbody == 8 + assert env.model.nbvh == 10 + assert env.model.njnt == 9 + assert env.model.ngeom == 9 + assert env.model.ntendon == 0 + + env = gym.make(f"Hopper-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 6 + assert env.model.nv == 6 + assert env.model.nu == 3 + assert env.model.nbody == 5 + assert env.model.nbvh == 5 + assert env.model.njnt == 6 + assert env.model.ngeom == 5 + assert env.model.ntendon == 0 + + env = gym.make(f"Humanoid-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 24 + assert env.model.nv == 23 + assert env.model.nu == 17 + assert env.model.nbody == 14 + assert env.model.nbvh == 22 + assert env.model.njnt == 18 + assert env.model.ngeom == 18 + assert env.model.ntendon == 2 + + env = gym.make(f"HumanoidStandup-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 24 + assert env.model.nv == 23 + assert env.model.nu == 17 + assert env.model.nbody == 14 + assert env.model.nbvh == 22 + assert env.model.njnt == 18 + assert env.model.ngeom == 18 + assert env.model.ntendon == 2 + + env = gym.make(f"InvertedDoublePendulum-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 3 + assert env.model.nv == 3 + assert env.model.nu == 1 + assert env.model.nbody == 4 + assert env.model.nbvh == 6 + assert env.model.njnt == 3 + assert env.model.ngeom == 5 + assert env.model.ntendon == 0 + + env = gym.make(f"InvertedPendulum-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 2 + assert env.model.nv == 2 + assert env.model.nu == 1 + assert env.model.nbody == 3 + assert env.model.nbvh == 3 + assert env.model.njnt == 2 + assert env.model.ngeom == 3 + assert env.model.ntendon == 0 + + env = gym.make(f"Pusher-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 11 + assert env.model.nv == 11 + assert env.model.nu == 7 + assert env.model.nbody == 13 + assert env.model.nbvh == 18 + assert env.model.njnt == 11 + assert env.model.ngeom == 21 + assert env.model.ntendon == 0 + + env = gym.make(f"Reacher-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 4 + assert env.model.nv == 4 + assert env.model.nu == 2 + assert env.model.nbody == 5 + assert env.model.nbvh == 5 + assert env.model.njnt == 4 + assert env.model.ngeom == 10 + assert env.model.ntendon == 0 + + env = gym.make(f"Swimmer-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 5 + assert env.model.nv == 5 + assert env.model.nu == 2 + assert env.model.nbody == 4 + assert env.model.nbvh == 4 + assert env.model.njnt == 5 + assert env.model.ngeom == 4 + assert env.model.ntendon == 0 + + env = gym.make(f"Walker2d-{version}").unwrapped + assert isinstance(env, BaseMujocoEnv) + assert env.model.nq == 9 + assert env.model.nv == 9 + assert env.model.nu == 6 + assert env.model.nbody == 8 + assert env.model.nbvh == 8 + assert env.model.njnt == 9 + assert env.model.ngeom == 8 + assert env.model.ntendon == 0 """ From fd3a4fc2bd4384be44df7946c740df2ae4c659e8 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sun, 9 Jul 2023 18:21:53 +0300 Subject: [PATCH 34/53] `pre-commit` --- tests/envs/mujoco/test_mujoco_v5.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index 58c5ba241..acb6bc23d 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -7,8 +7,8 @@ import gymnasium as gym from gymnasium.envs.mujoco.mujoco_env import BaseMujocoEnv, MujocoEnv from gymnasium.error import Error -from gymnasium.utils.env_match import check_environments_match from gymnasium.utils.env_checker import check_env +from gymnasium.utils.env_match import check_environments_match ALL_MUJOCO_ENVS = [ @@ -461,10 +461,7 @@ def test_z_distance_from_origin_info(env_name: str, version: str): ) -@pytest.mark.parametrize( - "env_name", - ALL_MUJOCO_ENVS -) +@pytest.mark.parametrize("env_name", ALL_MUJOCO_ENVS) @pytest.mark.parametrize("version", ["v5"]) def test_observation_structure(env_name: str, version: str): """Verify that the `env.observation_structure` is properly defined.""" From 0192681430ef3873d23b3047ea34f51a6c46d1e8 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sun, 9 Jul 2023 18:33:19 +0300 Subject: [PATCH 35/53] cleanup --- gymnasium/envs/mujoco/hopper_v5.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index aab4c5236..1cad148b1 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -317,14 +317,6 @@ def reset_model(self): observation = self._get_obs() return observation - def viewer_setup(self): - assert self.viewer is not None - for key, value in DEFAULT_CAMERA_CONFIG.items(): - if isinstance(value, np.ndarray): - getattr(self.viewer.cam, key)[:] = value - else: - setattr(self.viewer.cam, key, value) - def _get_reset_info(self): return { "x_position": self.data.qpos[0], From bd719e7707536e05d0d6b01bd47fb4054a30ad85 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sun, 9 Jul 2023 19:07:04 +0300 Subject: [PATCH 36/53] `hopper` revert change to model --- gymnasium/envs/mujoco/assets/hopper_v5.xml | 53 ---------------------- gymnasium/envs/mujoco/hopper_v5.py | 4 +- 2 files changed, 2 insertions(+), 55 deletions(-) delete mode 100644 gymnasium/envs/mujoco/assets/hopper_v5.xml diff --git a/gymnasium/envs/mujoco/assets/hopper_v5.xml b/gymnasium/envs/mujoco/assets/hopper_v5.xml deleted file mode 100644 index cc3fbc849..000000000 --- a/gymnasium/envs/mujoco/assets/hopper_v5.xml +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 1cad148b1..991f1a720 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -120,7 +120,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): | Parameter | Type | Default | Description | | -------------------------------------------- | --------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | `xml_file` | **str** | `"hopper_v5.xml"` | Path to a MuJoCo model | + | `xml_file` | **str** | `"hopper.xml"` | Path to a MuJoCo model | | `forward_reward_weight` | **float** | `1` | Weight for _forward_reward_ term (see section on reward) | | `ctrl_cost_weight` | **float** | `1e-3` | Weight for _ctrl_cost_ reward (see section on reward) | | `healthy_reward` | **float** | `1` | Weight for _healthy_reward_ reward (see section on reward) | @@ -149,7 +149,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="hopper_v5.xml", + xml_file="hopper.xml", frame_skip=4, default_camera_config=DEFAULT_CAMERA_CONFIG, forward_reward_weight=1.0, From 5b6efe895e0cd665741cae251525fda230e6a689 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 10 Jul 2023 14:35:00 +0300 Subject: [PATCH 37/53] add `test_dt` --- gymnasium/envs/mujoco/mujoco_env.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/gymnasium/envs/mujoco/mujoco_env.py b/gymnasium/envs/mujoco/mujoco_env.py index f1759da3d..82f7853b3 100644 --- a/gymnasium/envs/mujoco/mujoco_env.py +++ b/gymnasium/envs/mujoco/mujoco_env.py @@ -1,5 +1,5 @@ from os import path -from typing import Dict, Optional, Union +from typing import Dict, Optional, Tuple, Union import numpy as np @@ -67,7 +67,8 @@ def __init__( self.width = width self.height = height - self._initialize_simulation() # may use width and height + # may use width and height + self.model, self.data = self._initialize_simulation() self.init_qpos = self.data.qpos.ravel().copy() self.init_qvel = self.data.qvel.ravel().copy() @@ -229,9 +230,10 @@ def __init__( ) def _initialize_simulation(self): - self.model = mujoco_py.load_model_from_path(self.fullpath) - self.sim = mujoco_py.MjSim(self.model) - self.data = self.sim.data + model = mujoco_py.load_model_from_path(self.fullpath) + self.sim = mujoco_py.MjSim(model) + data = self.sim.data + return model, data def _reset_simulation(self): self.sim.reset() @@ -367,16 +369,22 @@ def __init__( from gymnasium.envs.mujoco.mujoco_rendering import MujocoRenderer + # An Optional member indicating the composion of the observation space + self.observation_structure: Dict + self.mujoco_renderer = MujocoRenderer( self.model, self.data, default_camera_config ) - def _initialize_simulation(self): - self.model = mujoco.MjModel.from_xml_path(self.fullpath) + def _initialize_simulation( + self, + ) -> Tuple[mujoco._structs.MjModel, mujoco._structs.MjData]: + model = mujoco.MjModel.from_xml_path(self.fullpath) # MjrContext will copy model.vis.global_.off* to con.off* - self.model.vis.global_.offwidth = self.width - self.model.vis.global_.offheight = self.height - self.data = mujoco.MjData(self.model) + model.vis.global_.offwidth = self.width + model.vis.global_.offheight = self.height + data = mujoco.MjData(model) + return model, data def _reset_simulation(self): mujoco.mj_resetData(self.model, self.data) From 0a74e969cbe3455dc2cd730ce186cd83bf852464 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 10 Jul 2023 14:35:37 +0300 Subject: [PATCH 38/53] cleanup --- tests/envs/mujoco/test_mujoco_v5.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index acb6bc23d..aa4f38a6f 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -179,6 +179,7 @@ def test_verify_reward_survive(env_name: str, version: str): env.reset(seed=0) env.action_space.seed(1) + terminal = False for step in range(80): obs, rew, terminal, truncated, info = env.step(env.action_space.sample()) @@ -661,12 +662,11 @@ def test_model_object_count(version: str): assert env.model.ntendon == 0 -""" def test_dt(): + """Assert that env.dt gets assigned correctly.""" env_a = gym.make("Ant-v5", include_cfrc_ext_in_observation=False) env_b = gym.make("Ant-v5", include_cfrc_ext_in_observation=False, frame_skip=1) env_b.unwrapped.model.opt.timestep = 0.05 assert env_a.dt == env_b.dt - check_environments_match(env_a, env_b, num_steps=100) -""" + # check_environments_match(env_a, env_b, num_steps=100) # This Fails as expected From c103054f709707a1e78239f779ff5909a644dfab Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas <30759571+Kallinteris-Andreas@users.noreply.github.com> Date: Mon, 10 Jul 2023 15:24:07 +0300 Subject: [PATCH 39/53] Add version to CITATION --- CITATION.cff | 1 + 1 file changed, 1 insertion(+) diff --git a/CITATION.cff b/CITATION.cff index 204cb1651..c60bf977a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,5 +1,6 @@ cff-version: 1.2.0 title: Gymnasium +version: 0.28.1 message: >- If you use this software, please cite it using the metadata from this file. From 021ce5188494b1b26c2f69a23d9b129f52895c23 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 10 Jul 2023 15:24:53 +0300 Subject: [PATCH 40/53] undo --- CITATION.cff | 1 - 1 file changed, 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index c60bf977a..204cb1651 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,6 +1,5 @@ cff-version: 1.2.0 title: Gymnasium -version: 0.28.1 message: >- If you use this software, please cite it using the metadata from this file. From 2dff912175fd5f5ee17973e9c74d70b96ce8a183 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 10 Jul 2023 16:30:18 +0300 Subject: [PATCH 41/53] undo --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 918788f0c..36c4f3065 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -145,7 +145,7 @@ reportInvalidTypeVarUse = "none" # reportUnknownVariableType = "warning" # -> raises 2585 warnings # reportUnknownArgumentType = "warning" # -> raises 2104 warnings reportGeneralTypeIssues = "none" # -> commented out raises 489 errors -# reportUntypedFunctionDecorator = "none" # -> pytest.mark.parameterize issues +reportUntypedFunctionDecorator = "none" # -> pytest.mark.parameterize issues reportPrivateUsage = "warning" reportUnboundVariable = "warning" From 04661001f5852a23e6d0c3eef158b469cf78b3dd Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 10 Jul 2023 16:35:09 +0300 Subject: [PATCH 42/53] undo --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 36c4f3065..918788f0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -145,7 +145,7 @@ reportInvalidTypeVarUse = "none" # reportUnknownVariableType = "warning" # -> raises 2585 warnings # reportUnknownArgumentType = "warning" # -> raises 2104 warnings reportGeneralTypeIssues = "none" # -> commented out raises 489 errors -reportUntypedFunctionDecorator = "none" # -> pytest.mark.parameterize issues +# reportUntypedFunctionDecorator = "none" # -> pytest.mark.parameterize issues reportPrivateUsage = "warning" reportUnboundVariable = "warning" From 7e8a4d637db7decc40a2e4f4304d816889f81487 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 10 Jul 2023 16:41:14 +0300 Subject: [PATCH 43/53] fix test_dt typing --- tests/envs/mujoco/test_mujoco_v5.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index aa4f38a6f..dfaef1008 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -664,9 +664,13 @@ def test_model_object_count(version: str): def test_dt(): """Assert that env.dt gets assigned correctly.""" - env_a = gym.make("Ant-v5", include_cfrc_ext_in_observation=False) - env_b = gym.make("Ant-v5", include_cfrc_ext_in_observation=False, frame_skip=1) - env_b.unwrapped.model.opt.timestep = 0.05 + env_a = gym.make("Ant-v5", include_cfrc_ext_in_observation=False).unwrapped + env_b = gym.make( + "Ant-v5", include_cfrc_ext_in_observation=False, frame_skip=1 + ).unwrapped + assert isinstance(env_a, BaseMujocoEnv) + assert isinstance(env_b, BaseMujocoEnv) + env_b.model.opt.timestep = 0.05 assert env_a.dt == env_b.dt # check_environments_match(env_a, env_b, num_steps=100) # This Fails as expected From ee66207feeadb7c9e67869ae5317b7aed63f71b8 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 10 Jul 2023 18:12:43 +0300 Subject: [PATCH 44/53] undo mujoco env changes --- gymnasium/envs/mujoco/mujoco_env.py | 30 +++++++++++------------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/gymnasium/envs/mujoco/mujoco_env.py b/gymnasium/envs/mujoco/mujoco_env.py index 82f7853b3..c3490bd1f 100644 --- a/gymnasium/envs/mujoco/mujoco_env.py +++ b/gymnasium/envs/mujoco/mujoco_env.py @@ -1,5 +1,5 @@ from os import path -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Union import numpy as np @@ -67,8 +67,7 @@ def __init__( self.width = width self.height = height - # may use width and height - self.model, self.data = self._initialize_simulation() + self._initialize_simulation() # may use width and height self.init_qpos = self.data.qpos.ravel().copy() self.init_qvel = self.data.qvel.ravel().copy() @@ -210,7 +209,7 @@ def __init__( logger.deprecation( "This version of the mujoco environments depends " "on the mujoco-py bindings, which are no longer maintained " - "and may stop working. Please upgrade to the v5 or v4 versions of " + "and may stop working. Please upgrade to the v4 versions of " "the environments (which depend on the mujoco python bindings instead), unless " "you are trying to precisely replicate previous works)." ) @@ -230,10 +229,9 @@ def __init__( ) def _initialize_simulation(self): - model = mujoco_py.load_model_from_path(self.fullpath) - self.sim = mujoco_py.MjSim(model) - data = self.sim.data - return model, data + self.model = mujoco_py.load_model_from_path(self.fullpath) + self.sim = mujoco_py.MjSim(self.model) + self.data = self.sim.data def _reset_simulation(self): self.sim.reset() @@ -369,22 +367,16 @@ def __init__( from gymnasium.envs.mujoco.mujoco_rendering import MujocoRenderer - # An Optional member indicating the composion of the observation space - self.observation_structure: Dict - self.mujoco_renderer = MujocoRenderer( self.model, self.data, default_camera_config ) - def _initialize_simulation( - self, - ) -> Tuple[mujoco._structs.MjModel, mujoco._structs.MjData]: - model = mujoco.MjModel.from_xml_path(self.fullpath) + def _initialize_simulation(self): + self.model = mujoco.MjModel.from_xml_path(self.fullpath) # MjrContext will copy model.vis.global_.off* to con.off* - model.vis.global_.offwidth = self.width - model.vis.global_.offheight = self.height - data = mujoco.MjData(model) - return model, data + self.model.vis.global_.offwidth = self.width + self.model.vis.global_.offheight = self.height + self.data = mujoco.MjData(self.model) def _reset_simulation(self): mujoco.mj_resetData(self.model, self.data) From a5da62b883eee20c5ce2b54be763cfca03873fcf Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 11 Jul 2023 02:10:33 +0300 Subject: [PATCH 45/53] a --- gymnasium/envs/mujoco/mujoco_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gymnasium/envs/mujoco/mujoco_env.py b/gymnasium/envs/mujoco/mujoco_env.py index 7018daa1c..099354a08 100644 --- a/gymnasium/envs/mujoco/mujoco_env.py +++ b/gymnasium/envs/mujoco/mujoco_env.py @@ -415,4 +415,4 @@ def close(self): self.mujoco_renderer.close() def get_body_com(self, body_name): - return self.data.body(body_name).xpos + return self.data.body(body_name).xpos \ No newline at end of file From f6fba956a36a22fa919ffe46b2d8dc2b8b311f92 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 11 Jul 2023 02:19:26 +0300 Subject: [PATCH 46/53] `pre-commit` --- gymnasium/envs/mujoco/mujoco_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gymnasium/envs/mujoco/mujoco_env.py b/gymnasium/envs/mujoco/mujoco_env.py index 099354a08..7018daa1c 100644 --- a/gymnasium/envs/mujoco/mujoco_env.py +++ b/gymnasium/envs/mujoco/mujoco_env.py @@ -415,4 +415,4 @@ def close(self): self.mujoco_renderer.close() def get_body_com(self, body_name): - return self.data.body(body_name).xpos \ No newline at end of file + return self.data.body(body_name).xpos From eaf46647a913fcc3f999c3cf64f367b62c131602 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Fri, 14 Jul 2023 19:05:24 +0300 Subject: [PATCH 47/53] @pseudo-rnd-thoughts 2nd review comments --- gymnasium/envs/mujoco/ant_v5.py | 105 ++++--- gymnasium/envs/mujoco/half_cheetah_v5.py | 68 ++-- gymnasium/envs/mujoco/hopper_v5.py | 74 +++-- gymnasium/envs/mujoco/humanoid_v5.py | 296 +++++++++--------- gymnasium/envs/mujoco/humanoidstandup_v5.py | 72 +++-- .../mujoco/inverted_double_pendulum_v5.py | 41 ++- gymnasium/envs/mujoco/inverted_pendulum_v5.py | 30 +- gymnasium/envs/mujoco/pusher_v5.py | 70 +++-- gymnasium/envs/mujoco/reacher_v5.py | 61 ++-- gymnasium/envs/mujoco/swimmer_v5.py | 51 +-- gymnasium/envs/mujoco/walker2d_v5.py | 74 +++-- 11 files changed, 548 insertions(+), 394 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 6a2c420c1..18d7de83c 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -26,11 +26,21 @@ class AntEnv(MujocoEnv, utils.EzPickle): torques on the eight hinges connecting the two body parts of each leg and the torso (nine body parts and eight hinges). + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Ant-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Ant-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Ant-v3 | `mujoco-py` | Maintained for reproducibility | + | Ant-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". ## Action Space The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints. - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | | 0 | Torque applied on the rotor between the torso and back right hip | -1 | 1 | hip_4 (right_back_leg) | hinge | torque (N m) | | 1 | Torque applied on the rotor between the back right two links | -1 | 1 | angle_4 (right_back_leg) | hinge | torque (N m) | @@ -43,20 +53,23 @@ class AntEnv(MujocoEnv, utils.EzPickle): ## Observation Space - Observations consist of positional values of different body parts of the ant, - followed by the velocities of those individual parts (their derivatives) with all - the positions ordered before all the velocities. + The observation Space consists of the following parts (in order): - By default, observations do not include the x- and y-coordinates of the ant's torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (107,), float64)` where the first two observations - represent the x- and y- coordinates of the ant's torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + - qpos:* Position values of the robots's body parts. + - qvel:* The velocities of these individual body parts, + (their derivatives). + - *cfrc_ext:* This is the center of mass based external forces on the body parts. + It has shape 13 * 6 (*nbody * 6*) and hence adds to another 78 elements in the state space. + (external forces - force x, y, z and torque x, y, z) - However, by default, observation Space is a `Box(-Inf, Inf, (105,), float64)` where the elements correspond to the following: + By default, the observation does not include the x- and y-coordinates of the torso. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (107,), float64)`, where the first two observations are the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + By default, however, the observation space is a `Box(-Inf, Inf, (105,), float64)`, where the position and velocity elements are as follows: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | @@ -88,44 +101,39 @@ class AntEnv(MujocoEnv, utils.EzPickle): | excluded | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | | excluded | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - - Additionally, after all the positional and velocity based values in the table, the observation contains: - - *cfrc_ext:* 13*6 = 78 elements, which are contact forces - (external forces - force x, y, z and torque x, y, z) applied to the - center of mass of each of the body parts. - The body parts are: - | id (for `v2`, `v3`, `v4)` | id (for `v5`) | body part | - | ---| --- | ------------ | - | 0 |excluded| worldbody (note: all values are constant 0) | - | 1 |0 | torso | - | 2 |1 | front_left_leg | - | 3 |2 | aux_1 (front left leg) | - | 4 |3 | ankle_1 (front left leg) | - | 5 |4 | front_right_leg | - | 6 |5 | aux_2 (front right leg) | - | 7 |6 | ankle_2 (front right leg) | - | 8 |7 | back_leg (back left leg) | - | 9 |8 | aux_3 (back left leg) | - | 10 |9 | ankle_3 (back left leg) | - | 11 |10 | right_back_leg | - | 12 |11 | aux_4 (back right leg) | - | 13 |12 | ankle_4 (back right leg) | + | body part | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ----------------------- | --- | --- | + | worldbody (note: all values are constant 0) | 0 |excluded| + | torso | 1 |0 | + | front_left_leg | 2 |1 | + | aux_1 (front left leg) | 3 |2 | + | ankle_1 (front left leg) | 4 |3 | + | front_right_leg | 5 |4 | + | aux_2 (front right leg) | 6 |5 | + | ankle_2 (front right leg) | 7 |6 | + | back_leg (back left leg) | 8 |7 | + | aux_3 (back left leg) | 9 |8 | + | ankle_3 (back left leg) | 10 |9 | + | right_back_leg | 11 |10 | + | aux_4 (back right leg) | 12 |11 | + | ankle_4 (back right leg) | 13 |12 | The (x,y,z) coordinates are translational DOFs while the orientations are rotational DOFs expressed as quaternions. One can read more about free joints in the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - **Note:** Ant-v4+ environment no longer has the following contact forces issue. - If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results + **Note:** + When using Ant-v3 or earlier versions, problems have been reported when using a Mujoco-Py version > 2.0 results in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 when using the Ant environment if you would like to report results with contact forces (if contact forces are not used in your experiments, you can use version > 2.0). ## Rewards - The reward consists of four parts: + The total reward is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. + - *healthy_reward*: Every timestep that the Ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`. @@ -147,26 +155,24 @@ class AntEnv(MujocoEnv, utils.EzPickle): $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-4}$), $F_{contact}$ are the external contact forces clipped by `contact_force_range` (see `cfrc_ext` section on observation). - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. But if `use_contact_forces=false` on `v4` The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. ## Starting State - All observations start in state - (0.0, 0.0, 0.75, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise - with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for - stochasticity. Note that the initial z coordinate is intentionally selected - to be slightly high, thereby indicating a standing up ant. The initial orientation - is designed to make it face forward as well. + The initial position state is $[0.0, 0.0, 0.75, 1.0, 0.0, ... 0.0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{15}, reset\_noise\_scale \times 1_{15}]}$. + The initial velocity state is $\mathcal{N}(0_{14}, reset\_noise\_scale^2 \times I_{14})$. + + where $\mathcal{N}$ is the multivariate normal distribution and $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z- and x-coordinates are non-zero so that the ant can immediately stand up and face forward (x-axis). ## Episode End #### Termination - If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Ant is unhealthy. + If `terminate_when_unhealthy is True` (the default), the environment terminates when the Ant is unhealthy. the Ant is unhealthy if any of the following happens: 1. Any of the state space values is no longer finite @@ -177,7 +183,8 @@ class AntEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + Ant provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -220,7 +227,7 @@ def __init__( self, xml_file: str = "ant.xml", frame_skip: int = 5, - default_camera_config: Dict = DEFAULT_CAMERA_CONFIG, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, forward_reward_weight: float = 1, ctrl_cost_weight: float = 0.5, contact_cost_weight: float = 5e-4, diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py index a9166492a..2738b459c 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v5.py +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -1,5 +1,7 @@ __credits__ = ["Kallinteris-Andreas", "Rushiv Arora"] +from typing import Dict + import numpy as np from gymnasium import utils @@ -26,11 +28,22 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): over the front and back thighs (connecting to the torso), shins (connecting to the thighs) and feet (connecting to the shins). + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | HalfCheetah-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | HalfCheetah-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | HalfCheetah-v3 | `mujoco-py` | Maintained for reproducibility | + | HalfCheetah-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + ## Action Space The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | --------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | | 0 | Torque applied on the back thigh rotor | -1 | 1 | bthigh | hinge | torque (N m) | | 1 | Torque applied on the back shin rotor | -1 | 1 | bshin | hinge | torque (N m) | @@ -41,19 +54,20 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): ## Observation Space - Observations consist of positional values of different body parts of the - cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + The observation Space consists of the following parts (in order): - By default, observations do not include the cheetah's `rootx`. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (18,), float64)` where the first element - represents the `rootx`. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the - will be returned in `info` with key `"x_position"`. + - qpos:* Position values of the robots's body parts. + - qvel:* The velocities of these individual body parts, + (their derivatives). + + By default, the observation does not include the robot's x-coordinate (`rootx`). + This can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (18,), float64)`, where the first observation element is the x--coordinate of the robot. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. However, by default, the observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | | 0 | z-coordinate of the front tip | -Inf | Inf | rootz | slide | position (m) | | 1 | angle of the front tip | -Inf | Inf | rooty | hinge | angle (rad) | @@ -76,7 +90,8 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of two parts: + The total reward is: ***reward*** *=* *forward_reward - ctrl_cost*. + - *forward_reward*: A reward for moving forward, this reward would be positive if the Half Cheetah moves forward (in the positive $x$ direction / in the right direction). @@ -90,18 +105,14 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is the `ctrl_cost_weight` (default is $0.1$). - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*, - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. ## Starting State - All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the - initial state for stochasticity. As seen before, the first 8 values in the - state are positional and the last 9 values are velocity. A uniform noise in - the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard - normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the - initial velocity values of all zeros. + The initial position state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{9}, reset\_noise\_scale \times 1_{9}]}$. + The initial velocity state is $\mathcal{N}(0_{9}, reset\_noise\_scale^2 \times I_{9})$. + + where $\mathcal{N}$ is the multivariate normal distribution and $\mathcal{U}$ is the multivariate uniform continuous distribution. ## Episode End @@ -113,7 +124,8 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + HalfCheetah provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -146,13 +158,13 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="half_cheetah.xml", - frame_skip=5, - default_camera_config=DEFAULT_CAMERA_CONFIG, - forward_reward_weight=1.0, - ctrl_cost_weight=0.1, - reset_noise_scale=0.1, - exclude_current_positions_from_observation=True, + xml_file: str = "half_cheetah.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 0.1, + reset_noise_scale: float = 0.1, + exclude_current_positions_from_observation: bool = True, **kwargs, ): utils.EzPickle.__init__( diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 991f1a720..2341e751a 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -1,5 +1,7 @@ __credits__ = ["Kallinteris-Andreas"] +from typing import Dict, Tuple + import numpy as np from gymnasium import utils @@ -28,11 +30,21 @@ class HopperEnv(MujocoEnv, utils.EzPickle): forward (right) direction by applying torques on the three hinges connecting the four body parts. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Hopper-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Hopper-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Hopper-v3 | `mujoco-py` | Maintained for reproducibility | + | Hopper-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". ## Action Space The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied at the hinge joints. - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | @@ -40,9 +52,11 @@ class HopperEnv(MujocoEnv, utils.EzPickle): ## Observation Space - Observations consist of positional values of different body parts of the - hopper, followed by the velocities of those individual parts - (their derivatives) with all the positions ordered before all the velocities. + The observation Space consists of the following parts (in order): + + - qpos:* Position values of the robots's body parts. + - qvel:* The velocities of these individual body parts, + (their derivatives). By default, observations do not include the x-coordinate of the hopper. It may be included by passing `exclude_current_positions_from_observation=False` during construction. @@ -51,10 +65,15 @@ class HopperEnv(MujocoEnv, utils.EzPickle): Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate will be returned in `info` with key `"x_position"`. + By default, the observation does not include the robot's x-coordinate (`rootx`). + This can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (12,), float64)`, where the first observation element is the x--coordinate of the robot. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | | 0 | z-coordinate of the torso (height of hopper) | -Inf | Inf | rootz | slide | position (m) | | 1 | angle of the torso | -Inf | Inf | rooty | hinge | angle (rad) | @@ -71,7 +90,8 @@ class HopperEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of three parts: + The total reward is: ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*. + - *healthy_reward*: Every timestep that the Hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`. @@ -88,18 +108,21 @@ class HopperEnv(MujocoEnv, utils.EzPickle): $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*, - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. + ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise - in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. + The initial position state is $[0, 1.25, 0, 0, 0, 0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{6}, reset\_noise\_scale \times 1_{6}]}$. + The initial velocity state is $0_6 + \mathcal{U}_{[-reset\_noise\_scale \times 1_{6}, reset\_noise\_scale \times 1_{6}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z-coordinate is non-zero so that the hopper can stand up immediately. ## Episode End #### Termination - If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Hopper is unhealthy. + If `terminate_when_unhealthy is True` (the default), the environment terminates when the Hopper is unhealthy. The Hopper is unhealthy if any of the following happens: 1. An element of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range` @@ -111,7 +134,8 @@ class HopperEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + Hopper provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -149,18 +173,18 @@ class HopperEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="hopper.xml", - frame_skip=4, - default_camera_config=DEFAULT_CAMERA_CONFIG, - forward_reward_weight=1.0, - ctrl_cost_weight=1e-3, - healthy_reward=1.0, - terminate_when_unhealthy=True, - healthy_state_range=(-100.0, 100.0), - healthy_z_range=(0.7, float("inf")), - healthy_angle_range=(-0.2, 0.2), - reset_noise_scale=5e-3, - exclude_current_positions_from_observation=True, + xml_file: str = "hopper.xml", + frame_skip: int = 4, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 1e-3, + healthy_reward: float = 1.0, + terminate_when_unhealthy: bool = True, + healthy_state_range: Tuple[float, float] = (-100.0, 100.0), + healthy_z_range: Tuple[float, float] = (0.7, float("inf")), + healthy_angle_range: Tuple[float, float] = (-0.2, 0.2), + reset_noise_scale: float = 5e-3, + exclude_current_positions_from_observation: bool = True, **kwargs, ): utils.EzPickle.__init__( diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index f3548c747..26bb5743d 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -1,5 +1,7 @@ __credits__ = ["Kallinteris-Andreas"] +from typing import Dict, Tuple + import numpy as np from gymnasium import utils @@ -31,45 +33,72 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over. - ## Action Space - The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. + Gymnasium includes the following versions of the environment: - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|----------------------|---------------|----------------|---------------------------------------|-------|------| - | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | - | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | - | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | - | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | - | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | - | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | - | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | - | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | - | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | - | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | - | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | - | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | - | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | - | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | - | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | - | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | - | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Humanoid-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Humanoid-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Humanoid-v3 | `mujoco-py` | Maintained for reproducibility | + | Humanoid-v2 | `mujoco-py` | Maintained for reproducibility | + For more information see section "Version History". - ## Observation Space - Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - By default, observations do not include the x- and y-coordinates of the torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (350,), float64)` where the first two observations - represent the x- and y-coordinates of the torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + ## Action Space + The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. + + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | + | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | + | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | + | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | + | 2 | Torque applied on the hinge in the x-coordinate of the abdomen | -0.4 | 0.4 | abdomen_x | hinge | torque (N m) | + | 3 | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh) | hinge | torque (N m) | + | 4 | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh) | hinge | torque (N m) | + | 5 | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh) | hinge | torque (N m) | + | 6 | Torque applied on the rotor between the right hip/thigh and the right shin | -0.4 | 0.4 | right_knee | hinge | torque (N m) | + | 7 | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate) | -0.4 | 0.4 | left_hip_x (left_thigh) | hinge | torque (N m) | + | 8 | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate) | -0.4 | 0.4 | left_hip_z (left_thigh) | hinge | torque (N m) | + | 9 | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate) | -0.4 | 0.4 | left_hip_y (left_thigh) | hinge | torque (N m) | + | 10 | Torque applied on the rotor between the left hip/thigh and the left shin | -0.4 | 0.4 | left_knee | hinge | torque (N m) | + | 11 | Torque applied on the rotor between the torso and right upper arm (coordinate -1) | -0.4 | 0.4 | right_shoulder1 | hinge | torque (N m) | + | 12 | Torque applied on the rotor between the torso and right upper arm (coordinate -2) | -0.4 | 0.4 | right_shoulder2 | hinge | torque (N m) | + | 13 | Torque applied on the rotor between the right upper arm and right lower arm | -0.4 | 0.4 | right_elbow | hinge | torque (N m) | + | 14 | Torque applied on the rotor between the torso and left upper arm (coordinate -1) | -0.4 | 0.4 | left_shoulder1 | hinge | torque (N m) | + | 15 | Torque applied on the rotor between the torso and left upper arm (coordinate -2) | -0.4 | 0.4 | left_shoulder2 | hinge | torque (N m) | + | 16 | Torque applied on the rotor between the left upper arm and left lower arm | -0.4 | 0.4 | left_elbow | hinge | torque (N m) | - However, by default, the observation is a `Box(-Inf, Inf, (348,), float64)`. The elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + ## Observation Space + The observation space consists of the following parts (in order) + + - qpos:* The position values of the robot's body parts. + - qvel:* The velocities of these individual body parts, + (their derivatives). + - *cinert:* Mass and inertia of the rigid body parts relative to the center of mass, + (this is an intermediate result of the transition). + It has shape 13*10 (*nbody * 10*) and thus adds another 130 elements to the observation space. + (cinert - inertia matrix and body mass offset and body mass) + - *cvel:* Center of mass based velocity. + It has shape 13 * 6 (*nbody * 6*) and thus adds another 78 elements to the observation space. + (com velocity - velocity x, y, z and angular velocity x, y, z) + - *qfrc_actuator:* Constraint force generated as the actuator force at each joint. + This has shape `(17,)` *(nv * 1)* and thus adds another 17 elements to the observation space. + - *cfrc_ext:* This is the center of mass based external force on the body parts. + It has shape 13 * 6 (*nbody * 6*) and thus adds to another 78 elements in the observation space. + (external forces - force x, y, z and torque x, y, z) + + where *nbody* is for the number of bodies in the robot + and *nv* is for the number of degrees of freedom (*= dim(qvel)*). + + By default, the observation does not include the x- and y-coordinates of the torso. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (350,), float64)`, where the first two observations are the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, however, the observation space is a `Box(-Inf, Inf, (348,), float64)`, where the position and velocity elements are as follows: + + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | @@ -96,96 +125,82 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angular velocity (rad/s) | + | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angular velocity (rad/s) | + | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) | + | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angular velocity (rad/s) | + | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angular velocity (rad/s) | + | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angular velocity (rad/s) | + | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angular velocity (rad/s) | + | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angular velocity (rad/s) | + | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angular velocity (rad/s) | + | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angular velocity (rad/s) | + | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angular velocity (rad/s) | + | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angular velocity (rad/s) | + | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angular velocity (rad/s) | + | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angular velocity (rad/s) | + | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angular velocity (rad/s) | + | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angular velocity (rad/s) | + | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) | | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - Additionally, after all the positional and velocity based values in the table, - the observation contains (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 13*10 (*nbody * 10*) - and hence adds to another 130 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 13 * 6 (*nbody * 6*) and hence - adds another 78 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(17,)` *(nv * 1)* and hence adds another 17 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 13 * 6 (*nbody * 6*) and hence adds to another 78 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the - number of degrees of freedom (*= dim(qvel)*) - The body parts are: - | id (for `v2`, `v3`, `v4)` | id (for `v5`) | body part | - | ---| --- | ------------ | - | 0 |excluded| worldbody (note: all values are constant 0) | - | 1 | 0 | torso | - | 2 | 1 | lwaist | - | 3 | 2 | pelvis | - | 4 | 3 | right_thigh | - | 5 | 4 | right_sin | - | 6 | 5 | right_foot | - | 7 | 6 | left_thigh | - | 8 | 7 | left_sin | - | 9 | 8 | left_foot | - | 10 | 9 | right_upper_arm | - | 11 | 10 | right_lower_arm | - | 12 | 11 | left_upper_arm | - | 13 | 12 | left_lower_arm | + | body part | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | worldbody (note: all values are constant 0) | 0 |excluded| + | torso |1 | 0 | + | lwaist |2 | 1 | + | pelvis |3 | 2 | + | right_thigh |4 | 3 | + | right_sin |5 | 4 | + | right_foot |6 | 5 | + | left_thigh |7 | 6 | + | left_sin |8 | 7 | + | left_foot |9 | 8 | + | right_upper_arm |10 | 9 | + | right_lower_arm |11 | 10 | + | left_upper_arm |12 | 11 | + | left_lower_arm |13 | 12 | The joints are: - | id (for `v2`, `v3`, `v4)` | id (for `v5`) | joint | - | ---| --- | ------------ | - | 0 |excluded| root (note: all values are constant 0) | - | 1 |excluded| root (note: all values are constant 0) | - | 2 |excluded| root (note: all values are constant 0) | - | 3 |excluded| root (note: all values are constant 0) | - | 4 |excluded| root (note: all values are constant 0) | - | 5 |excluded| root (note: all values are constant 0) | - | 6 | 0 | abdomen_z | - | 7 | 1 | abdomen_y | - | 8 | 2 | abdomen_x | - | 9 | 3 | right_hip_x | - | 10 | 4 | right_hip_z | - | 11 | 5 | right_hip_y | - | 12 | 6 | right_knee | - | 13 | 7 | left_hip_x | - | 14 | 8 | left_hiz_z | - | 15 | 9 | left_hip_y | - | 16 | 10 | left_knee | - | 17 | 11 | right_shoulder1 | - | 18 | 12 | right_shoulder2 | - | 19 | 13 | right_elbow| - | 20 | 14 | left_shoulder1 | - | 21 | 15 | left_shoulder2 | - | 22 | 16 | left_elfbow | + | joint | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | root (note: all values are constant 0) | 0 |excluded| + | root (note: all values are constant 0) | 1 |excluded| + | root (note: all values are constant 0) | 2 |excluded| + | root (note: all values are constant 0) | 3 |excluded| + | root (note: all values are constant 0) | 4 |excluded| + | root (note: all values are constant 0) | 5 |excluded| + | abdomen_z | 6 | 0 | + | abdomen_y | 7 | 1 | + | abdomen_x | 8 | 2 | + | right_hip_x | 9 | 3 | + | right_hip_z | 10 | 4 | + | right_hip_y | 11 | 5 | + | right_knee | 12 | 6 | + | left_hip_x | 13 | 7 | + | left_hiz_z | 14 | 8 | + | left_hip_y | 15 | 9 | + | left_knee | 16 | 10 | + | right_shoulder1 | 17 | 11 | + | right_shoulder2 | 18 | 12 | + | right_elbow | 19 | 13 | + | left_shoulder1 | 20 | 14 | + | left_shoulder2 | 21 | 15 | + | left_elfbow | 22 | 16 | The (x,y,z) coordinates are translational DOFs while the orientations are rotational DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - **Note:** Humanoid-v4 environment no longer has the following contact forces issue. - If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 + **Note:** + When using Humanoid-v3 or earlier versions, problems have been reported when using a Mujoco-Py version > 2.0. results in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 when using the Humanoid environment if you would like to report results with contact forces (if contact forces are not used in your experiments, you can use @@ -193,7 +208,8 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of three parts: + The total reward is: ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*. + - *healthy_reward*: Every timestep that the Humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward`. @@ -205,36 +221,33 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): $dt$ is the time between actions, which depends on the `frame_skip` parameter (default is 5), and `frametime` which is 0.001 - so the default is $dt = 5 \times 0.003 = 0.015$, $w_{forward}$ is the `forward_reward_weight` (default is $1.25$). - The center of mass calculation is defined in the `.py` file for the Humanoid. - *ctrl_cost*: A negative reward to penalize the Humanoid for taking actions that are too large. $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $0.1$). - If there are *nu* actuators/controls, then the control has shape `nu x 1`. - *contact_cost*: A negative reward to penalize the Humanoid if the external contact forces are too large. $w_{contact} \times clamp(contact\\_cost\\_range, \\|F_{contact}\\|_2^2)$, where $w_{contact}$ is `contact_cost_weight` (default is $5\times10^{-7}$), $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). - The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. - Note: in `v4` the total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* + **Note:** There is a bug in the `Humanoid-v4` environment that causes *contact_cost* to always be 0. ## Starting State - All observations start in state - (0.0, 0.0, 1.4, 1.0, 0.0 ... 0.0) with a uniform noise in the range - of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally - selected to be high, thereby indicating a standing up humanoid. The initial - orientation is designed to make it face forward as well. + The initial position state is $[0.0, 0.0, 1.4, 1.0, 0.0, ... 0.0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{24}, reset\_noise\_scale \times 1_{24}]}$. + The initial velocity state is $0_{23} + \mathcal{U}_{[-reset\_noise\_scale \times 1_{23}, reset\_noise\_scale \times 1_{23}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z- and x-coordinates are non-zero so that the humanoid can immediately stand up and face forward (x-axis). ## Episode End #### Termination - If `terminate_when_unhealthy is True` (which is the default), the environment terminates when the Humanoid is unhealthy. + If `terminate_when_unhealthy is True` (the default), the environment terminates when the Humanoid is unhealthy. The Humanoid is said to be unhealthy if any of the following happens: 1. The z-position of the torso (the height) is no longer contained in `healthy_z_range`. @@ -244,7 +257,8 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + Humanoid provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -258,7 +272,7 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): | `ctrl_cost_weight` | **float** | `0.1` | Weight for _ctrl_cost_ term (see section on reward) | | `contact_cost_weight` | **float** | `5e-7` | Weight for _contact_cost_ term (see section on reward) | | `contact_cost_range` | **float** | `(-np.inf, 10.0) | Clamps the _contact_cost_ term (see section on reward) | - | `healthy_reward` | **float** | `5.0` | Weight for _healthy_reward_ term (see section on reward) | + | `healthy_reward` | **float** | `5.0` | Weight for _healthy_reward_ term (see section on reward) | | `terminate_when_unhealthy` | **bool** | `True` | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` | | `healthy_z_range` | **tuple** | `(1.0, 2.0)` | The humanoid is considered healthy if the z-coordinate of the torso is in this range | | `reset_noise_scale` | **float** | `1e-2` | Scale of random perturbations of initial position and velocity (see section on Starting State) | @@ -286,22 +300,22 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="humanoid.xml", - frame_skip=5, - default_camera_config=DEFAULT_CAMERA_CONFIG, - forward_reward_weight=1.25, - ctrl_cost_weight=0.1, - contact_cost_weight=5e-7, - contact_cost_range=(-np.inf, 10.0), - healthy_reward=5.0, - terminate_when_unhealthy=True, - healthy_z_range=(1.0, 2.0), - reset_noise_scale=1e-2, - exclude_current_positions_from_observation=True, - include_cinert_in_observation=True, - include_cvel_in_observation=True, - include_qfrc_actuator_in_observation=True, - include_cfrc_ext_in_observation=True, + xml_file: str = "humanoid.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.25, + ctrl_cost_weight: float = 0.1, + contact_cost_weight: float = 5e-7, + contact_cost_range: Tuple[float, float] = (-np.inf, 10.0), + healthy_reward: float = 5.0, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (1.0, 2.0), + reset_noise_scale: float = 1e-2, + exclude_current_positions_from_observation: bool = True, + include_cinert_in_observation: bool = True, + include_cvel_in_observation: bool = True, + include_qfrc_actuator_in_observation: bool = True, + include_cfrc_ext_in_observation: bool = True, **kwargs, ): utils.EzPickle.__init__( diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index a5b4e54de..e48f37027 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -1,5 +1,7 @@ __credits__ = ["Kallinteris-Andreas"] +from typing import Dict, Tuple + import numpy as np from gymnasium import utils @@ -26,14 +28,21 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): and then the goal of the environment is to make the humanoid standup and then keep it standing by applying torques on the various hinges. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | HumanoidStandup-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | HumanoidStandup-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | HumanoidStandup-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". - ## Action Space - The agent take a 17-element vector for actions. - The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action` - represents the numerical torques applied at the hinge joints. + ## Action Space + The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints. - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ | | 0 | Torque applied on the hinge in the y-coordinate of the abdomen | -0.4 | 0.4 | abdomen_y | hinge | torque (N m) | | 1 | Torque applied on the hinge in the z-coordinate of the abdomen | -0.4 | 0.4 | abdomen_z | hinge | torque (N m) | @@ -68,7 +77,7 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): However, by default, the observation is a `Box(-Inf, Inf, (348,), float64)`. The elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | @@ -183,8 +192,8 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html). - **Note:** Humanoid-v4 environment no longer has the following contact forces issue. - If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 + **Note:** + When using Humanoid-v3 or earlier versions, problems have been reported when using a Mujoco-Py version > 2.0. results in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0 when using the Humanoid environment if you would like to report results with contact forces (if contact forces are not used in your experiments, you can use @@ -192,7 +201,8 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of three parts: + The total reward is: ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*. + - *uph_cost*: A reward for moving up (trying to stand up). This is not a relative reward, measuring how far up it has moved since the last timestep, @@ -213,17 +223,16 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): $w_{impact}$ is `impact_cost_weight` (default is $5\times10^{-7}$), $F_{contact}$ are the external contact forces (see `cfrc_ext` section on observation). - The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*, - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. ## Starting State - All observations start in state - (0.0, 0.0, 0.105, 1.0, 0.0 ... 0.0) with a uniform noise in the range of - [-0.01, 0.01] added to the positional and velocity values (values in the table) - for stochasticity. Note that the initial z coordinate is intentionally selected - to be low, thereby indicating a laying down humanoid. The initial orientation is - designed to make it face forward as well. + The initial position state is $[0.0, 0.0, 1.4, 1.0, 0.0, ... 0.0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{24}, reset\_noise\_scale \times 1_{24}]}$. + The initial velocity state is $0_{23} + \mathcal{U}_{[-reset\_noise\_scale \times 1_{23}, reset\_noise\_scale \times 1_{23}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z- and x-coordinates are non-zero so that the humanoid immediately lies down and faces forward (x-axis). ## Episode End @@ -235,7 +244,8 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + HumanoidStandup provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -274,19 +284,19 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="humanoidstandup.xml", - frame_skip=5, - default_camera_config=DEFAULT_CAMERA_CONFIG, - uph_cost_weight=1, - ctrl_cost_weight=0.1, - impact_cost_weight=0.5e-6, - impact_cost_range=(-np.inf, 10.0), - reset_noise_scale=1e-2, - exclude_current_positions_from_observation=True, - include_cinert_in_observation=True, - include_cvel_in_observation=True, - include_qfrc_actuator_in_observation=True, - include_cfrc_ext_in_observation=True, + xml_file: str = "humanoidstandup.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + uph_cost_weight: float = 1, + ctrl_cost_weight: float = 0.1, + impact_cost_weight: float = 0.5e-6, + impact_cost_range: Tuple[float, float] = (-np.inf, 10.0), + reset_noise_scale: float = 1e-2, + exclude_current_positions_from_observation: bool = True, + include_cinert_in_observation: bool = True, + include_cvel_in_observation: bool = True, + include_qfrc_actuator_in_observation: bool = True, + include_cfrc_ext_in_observation: bool = True, **kwargs, ): utils.EzPickle.__init__( diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py index 5c52d3b05..198524981 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -27,6 +27,15 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): and the goal is to balance the second pole on top of the first pole, which is in turn on top of the cart, by applying continuous forces on the cart. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | InvertedDoublePendulum-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | InvertedDoublePendulum-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | InvertedDoublePendulum-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". ## Action Space The agent take a 1-element vector for actions. @@ -34,7 +43,7 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): numerical force applied to the cart (with magnitude representing the amount of force and sign representing the direction) - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint |Type (Unit)| |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| | 0 | Force applied on the cart | -1 | 1 | slider | slide | Force (N) | @@ -46,7 +55,7 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): The observation is a `ndarray` with shape `(9,)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | ----------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | | 1 | sine of the angle between the cart and the first pole | -Inf | Inf | sin(hinge) | hinge | unitless | @@ -72,7 +81,8 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of two parts: + The total reward is: ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*. + - *alive_bonus*: The goal is to keep the second inverted pendulum upright (within a certain angle limit) as long as possible - so for each timestep that the second pole is upright, a reward of `healthy_reward` is given. @@ -85,15 +95,14 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): $10^{-3} \omega_1 + 5 10^{-3} \omega_2$, where $\omega_1, \omega_2$ are the angular velocities of the hinges. - The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty* - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range - of `[-reset_noise_scale, reset_noise_scale]` added to the positional values (cart position and pole angles) and standard - normal force with a standard deviation of `reset_noise_scale` added to the velocity values for stochasticity. + The initial position state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{3}, reset\_noise\_scale \times 1_{3}]}$. + The initial velocity state is $\mathcal{N}(0_{3}, reset\_noise\_scale^2 \times I_{3})$. + + where $\mathcal{N}$ is the multivariate normal distribution and $\mathcal{U}$ is the multivariate uniform continuous distribution. ## Episode End @@ -102,14 +111,16 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): The Inverted Double Pendulum is unhealthy if any of the following happens: 1.Termination: The y_coordinate of the tip of the second pole $\leq 1$. - The maximum standing height of the system is 1.2 m when all the parts are perpendicularly vertical on top of each other). + + Note: The maximum standing height of the system is 1.2 m when all the parts are perpendicularly vertical on top of each other. #### Truncation The maximum duration of an episode is 1000 timesteps. ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`, `healthy_reward`, `reset_noise_scale`, etc. + InvertedDoublePendulum provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -140,10 +151,10 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="inverted_double_pendulum.xml", - frame_skip=5, - healthy_reward=10.0, - reset_noise_scale=0.1, + xml_file: str = "inverted_double_pendulum.xml", + frame_skip: int = 5, + healthy_reward: float = 10.0, + reset_noise_scale: float = 0.1, **kwargs, ): utils.EzPickle.__init__(self, xml_file, frame_skip, reset_noise_scale, **kwargs) diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_pendulum_v5.py index 75b964755..d9d83322a 100644 --- a/gymnasium/envs/mujoco/inverted_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_pendulum_v5.py @@ -25,6 +25,16 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): at one end and having another end free. The cart can be pushed left or right, and the goal is to balance the pole on the top of the cart by applying forces on the cart. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | InvertedPendulum-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | InvertedPendulum-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | InvertedPendulum-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + ## Action Space The agent take a 1-element vector for actions. @@ -33,7 +43,7 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): the numerical force applied to the cart (with magnitude representing the amount of force and sign representing the direction) - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint |Type (Unit)| |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------| | 0 | Force applied on the cart | -3 | 3 | slider | slide | Force (N) | @@ -45,12 +55,12 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | --------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------- | | 0 | position of the cart along the linear surface | -Inf | Inf | slider | slide | position (m) | | 1 | vertical angle of the pole on the cart | -Inf | Inf | hinge | hinge | angle (rad) | | 2 | linear velocity of the cart | -Inf | Inf | slider | slide | velocity (m/s) | - | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | anglular velocity (rad/s) | + | 3 | angular velocity of the pole on the cart | -Inf | Inf | hinge | hinge | angular velocity (rad/s) | ## Rewards @@ -65,6 +75,11 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): ## Starting State + The initial position state is $\\mathcal{U}_{[-reset\\_noise\\_scale \times 1_{2}, reset\\_noise\\_scale \times 1_{2}]}$. + The initial velocity state is $\\mathcal{U}_{[-reset\\_noise\\_scale \times 1_{2}, reset\\_noise\\_scale \times 1_{2}]}$. + + where $\\mathcal{U}$ is the multivariate uniform continuous distribution. + All observations start in state (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range of `[-reset_noise_scale, reset_noise_scale]` added to the values for stochasticity. @@ -83,7 +98,8 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `reset_noise_scale`. + InvertedPendulum provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -113,9 +129,9 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="inverted_pendulum.xml", - frame_skip=2, - reset_noise_scale=0.01, + xml_file: str = "inverted_pendulum.xml", + frame_skip: int = 2, + reset_noise_scale: float = 0.01, **kwargs, ): utils.EzPickle.__init__(self, xml_file, frame_skip, reset_noise_scale, **kwargs) diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index 93a4c32f3..60bbe3746 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -1,5 +1,7 @@ __credits__ = ["Kallinteris-Andreas"] +from typing import Dict + import numpy as np from gymnasium import utils @@ -20,19 +22,29 @@ class PusherEnv(MujocoEnv, utils.EzPickle): The goal is to move a target cylinder (called *object*) to a goal position using the robot's end effector (called *fingertip*). The robot consists of shoulder, elbow, forearm, and wrist joints. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Pusher-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Pusher-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Pusher-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + ## Action Space The action space is a `Box(-2, 2, (7,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | |-----|--------------------------------------------------------------------|-------------|-------------|----------------------------------|-------|--------------| - | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | - | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | - | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | - | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | - | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | - | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | - | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | + | 0 | Rotation of the panning the shoulder | -2 | 2 | r_shoulder_pan_joint | hinge | torque (N m) | + | 1 | Rotation of the shoulder lifting joint | -2 | 2 | r_shoulder_lift_joint | hinge | torque (N m) | + | 2 | Rotation of the shoulder rolling joint | -2 | 2 | r_upper_arm_roll_joint | hinge | torque (N m) | + | 3 | Rotation of hinge joint that flexed the elbow | -2 | 2 | r_elbow_flex_joint | hinge | torque (N m) | + | 4 | Rotation of hinge that rolls the forearm | -2 | 2 | r_forearm_roll_joint | hinge | torque (N m) | + | 5 | Rotation of flexing the wrist | -2 | 2 | r_wrist_flex_joint | hinge | torque (N m) | + | 6 | Rotation of rolling the wrist | -2 | 2 | r_wrist_roll_joint | hinge | torque (N m) | ## Observation Space @@ -48,7 +60,7 @@ class PusherEnv(MujocoEnv, utils.EzPickle): An analogy can be drawn to a human arm in order to help understand the state space, with the words flex and roll meaning the same as human joints. - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | -------------------------------------------------------- | ---- | --- | -------------------------------- | -------- | ------------------------ | | 0 | Rotation of the panning the shoulder | -Inf | Inf | r_shoulder_pan_joint | hinge | angle (rad) | | 1 | Rotation of the shoulder lifting joint | -Inf | Inf | r_shoulder_lift_joint | hinge | angle (rad) | @@ -76,7 +88,8 @@ class PusherEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of two parts: + The total reward is: ***reward*** *=* *reward_dist + reward_ctrl + reward_near*. + - *reward_near*: This reward is a measure of how far the *fingertip* of the pusher (the unattached end) is from the object, with a more negative value assigned for when the pusher's *fingertip* is further away from the target. @@ -92,20 +105,22 @@ class PusherEnv(MujocoEnv, utils.EzPickle): It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. where $w_{control}$ is the `reward_control_weight`. - The total reward returned is ***reward*** *=* *reward_dist + reward_ctrl + reward_near*, - `info` will also contain the individual reward terms. + `info` contains the individual reward terms. ## Starting State - All pusher (not including object and goal) states start in - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0). A uniform noise in the range - [-0.005, 0.005] is added to the velocity attributes only. The velocities of - the object and goal are permanently set to 0. The object's x-position is selected uniformly - between [-0.3, 0] while the y-position is selected uniformly between [-0.2, 0.2], and this - process is repeated until the vector norm between the object's (x,y) position and origin is not greater - than 0.17. The goal always have the same position of (0.45, -0.05, -0.323). + The initial position state of the Pusher arm is $0_{6}$. + The initial position state of the object is $\mathcal{U}_{[[-0.3, -0.2], [0, 0.2]]}$. + The position state of the goal is (permanently) $[0.45, -0.05, -0.323]$. + The initial velocity state of the Pusher arm is $\mathcal{U}_{[-0.005 \times 1_{6}, 0.005 \times 1_{6}]}$. + The initial velocity state of the object is $0_2$. + The velocity state of the goal is (permanently) $0_3$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the initial position state of the object is sampled until it's distance to the goal is $ > 0.17 m$. - The default framerate is 5 with each frame lasting for 0.01, giving rise to a *dt = 5 * 0.01 = 0.05* + The default frame rate is 5, with each frame lasting for 0.01, so *dt = 5 * 0.01 = 0.05*. ## Episode End @@ -117,7 +132,8 @@ class PusherEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`. + Pusher provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -149,12 +165,12 @@ class PusherEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="pusher.xml", - frame_skip=5, - default_camera_config=DEFAULT_CAMERA_CONFIG, - reward_near_weight=0.5, - reward_dist_weight=1, - reward_control_weight=0.1, + xml_file: str = "pusher.xml", + frame_skip: int = 5, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + reward_near_weight: float = 0.5, + reward_dist_weight: float = 1, + reward_control_weight: float = 0.1, **kwargs, ): utils.EzPickle.__init__( diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index e986183d4..05ad477e2 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -1,5 +1,7 @@ __credits__ = ["Kallinteris-Andreas"] +from typing import Dict + import numpy as np from gymnasium import utils @@ -7,7 +9,7 @@ from gymnasium.spaces import Box -DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0} +DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0.0} class ReacherEnv(MujocoEnv, utils.EzPickle): @@ -16,14 +18,24 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): "Reacher" is a two-jointed robot arm. The goal is to move the robot's end effector (called *fingertip*) close to a target that is spawned at a random position. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Reacher-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Reacher-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Reacher-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + ## Action Space The action space is a `Box(-1, 1, (2,), float32)`. An action `(a, b)` represents the torques applied at the hinge joints. - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | - |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------|-------|------| - | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | - | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | + | Num | Action | Control Min | Control Max |Name (in corresponding XML file)| Joint | Type (Unit) | + |-----|---------------------------------------------------------------------------------|-------------|-------------|--------------------------------|-------|--------------| + | 0 | Torque applied at the first hinge (connecting the link to the point of fixture) | -1 | 1 | joint0 | hinge | torque (N m) | + | 1 | Torque applied at the second hinge (connecting the two links) | -1 | 1 | joint1 | hinge | torque (N m) | ## Observation Space @@ -37,7 +49,7 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): The observation is a `Box(-Inf, Inf, (10,), float64)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | ---------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | | 0 | cosine of the angle of the first arm | -Inf | Inf | cos(joint0) | hinge | unitless | | 1 | cosine of the angle of the second arm | -Inf | Inf | cos(joint1) | hinge | unitless | @@ -66,7 +78,8 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of two parts: + The total reward is: ***reward*** *=* *reward_distance + reward_control*. + - *reward_distance*: This reward is a measure of how far the *fingertip* of the reacher (the unattached end) is from the target, with a more negative value assigned for if the reacher's *fingertip* is further away from the target. @@ -77,19 +90,18 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): It is measured as the negative squared Euclidean norm of the action, i.e. as $-w_{control} \|action\|_2^2$. where $w_{control}$ is the `reward_control_weight`. - The total reward returned is ***reward*** *=* *reward_distance + reward_control*. - `info` will also contain the individual reward terms. + `info` contains the individual reward terms. ## Starting State - All observations start in state - (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a noise added for stochasticity. A uniform noise in the range - [-0.1, 0.1] is added to the positional attributes, while the target position - is selected uniformly at random in a disk of radius 0.2 around the origin. - Independent, uniform noise in the - range of [-0.005, 0.005] is added to the velocities, and the last - element ("fingertip" - "target") is calculated at the end once everything - is set. The default setting has a framerate of 2 and a *dt = 2 * 0.01 = 0.02* + The initial position state of the reacher arm is $\mathcal{U}_{[-0.1 \times 1_{2}, 0.1 \times 1_{2}]}$. + The position state of the goal is (permanently) $\mathcal{S}(0.2)$. + The initial velocity state of the Reacher arm is $\mathcal{U}_{[-0.005 \times 1_{2}, 0.005 \times 1_{2}]}$. + The velocity state of the object is (permanently) $0_2$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution and $\mathcal{S} is the uniform continuous spherical distribution. + + The default frame rate is 2, with each frame lasting for 0.01, so *dt = 2 * 0.01 = 0.02*. + ## Episode End #### Termination @@ -100,7 +112,8 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`. + Reacher provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -131,11 +144,11 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="reacher.xml", - frame_skip=2, - default_camera_config=DEFAULT_CAMERA_CONFIG, - reward_dist_weight=1, - reward_control_weight=1, + xml_file: str = "reacher.xml", + frame_skip: int = 2, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + reward_dist_weight: float = 1, + reward_control_weight: float = 1, **kwargs, ): utils.EzPickle.__init__( diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 8b647124a..7803dcfac 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -34,11 +34,22 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): and *k* = 0.1. It is possible to pass a custom MuJoCo XML file during construction to increase the number of links, or to tweak any of the parameters. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Swimmer-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Swimmer-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Swimmer-v3 | `mujoco-py` | Maintained for reproducibility | + | Swimmer-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + ## Action Space The action space is a `Box(-1, 1, (2,), float32)`. An action represents the torques applied between *links* - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------| | 0 | Torque applied on the first rotor | -1 | 1 | motor1_rot | hinge | torque (N m) | | 1 | Torque applied on the second rotor | -1 | 1 | motor2_rot | hinge | torque (N m) | @@ -49,16 +60,14 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): * θi: angle of part *i* with respect to the *x* axis * θi': its derivative with respect to time (angular velocity) - In the default case, observations do not include the x- and y-coordinates of the front tip. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - Then, the observation space will be `Box(-Inf, Inf, (10,), float64)` where the first two observations - represent the x- and y-coordinates of the front tip. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + By default, the observation does not include the x- and y-coordinates of the front tip. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (10,), float64)`, where the first two observations are the x- and y-coordinates of the front tip. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. By default, the observation is a `Box(-Inf, Inf, (8,), float64)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ | | 0 | angle of the front tip | -Inf | Inf | free_body_rot | hinge | angle (rad) | | 1 | angle of the first rotor | -Inf | Inf | motor1_rot | hinge | angle (rad) | @@ -73,7 +82,8 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of two parts: + The total reward is: ***reward*** *=* *forward_reward - ctrl_cost*. + - *forward_reward*: A reward for moving forward, this reward would be positive if the Swimmer moves forward (in the positive $x$ direction / in the right direction). @@ -87,12 +97,14 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-4}$). - The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost*, - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. ## Starting State - All observations start in state (0,0,0,0,0,0,0,0) with a Uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the initial state for stochasticity. + The initial position state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{5}, reset\_noise\_scale \times 1_{5}]}$. + The initial velocity state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{5}, reset\_noise\_scale \times 1_{5}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. ## Episode End @@ -104,7 +116,8 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`. + Swimmer provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -138,12 +151,12 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="swimmer.xml", - frame_skip=4, - forward_reward_weight=1.0, - ctrl_cost_weight=1e-4, - reset_noise_scale=0.1, - exclude_current_positions_from_observation=True, + xml_file: str = "swimmer.xml", + frame_skip: int = 4, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 1e-4, + reset_noise_scale: float = 0.1, + exclude_current_positions_from_observation: bool = True, **kwargs, ): utils.EzPickle.__init__( diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index d2d12a0ed..84b21fb63 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -1,5 +1,7 @@ __credits__ = ["Kallinteris-Andreas"] +from typing import Dict, Tuple + import numpy as np from gymnasium import utils @@ -28,11 +30,22 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): The goal is to walk in the in the forward (right) direction by applying torques on the six hinges connecting the seven body parts. + Gymnasium includes the following versions of the environment: + + | Environment | Binding | Notes | + | ------------------------- | --------------- | ------------------------------------------- | + | Walker2d-v5 | `mujoco=>2.3.3` | Recommended (most features, the least bugs) | + | Walker2d-v4 | `mujoco=>2.1.3` | Maintained for reproducibility | + | Walker2d-v3 | `mujoco-py` | Maintained for reproducibility | + | Walker2d-v2 | `mujoco-py` | Maintained for reproducibility | + + For more information see section "Version History". + ## Action Space The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints. - | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Action | Control Min | Control Max | Name (in corresponding XML file) | Joint | Type (Unit) | |-----|----------------------------------------|-------------|-------------|----------------------------------|-------|--------------| | 0 | Torque applied on the thigh rotor | -1 | 1 | thigh_joint | hinge | torque (N m) | | 1 | Torque applied on the leg rotor | -1 | 1 | leg_joint | hinge | torque (N m) | @@ -43,19 +56,20 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): ## Observation Space - Observations consist of positional values of different body parts of the walker, - followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities. + The observation Space consists of the following parts (in order): - By default, observations do not include the x-coordinate of the torso. It may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be `Box(-Inf, Inf, (18,), float64)` where the first observation - represent the x-coordinates of the torso of the walker. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate - of the torso will be returned in `info` with key `"x_position"`. + - qpos:* Position values of the robots's body parts. + - qvel:* The velocities of these individual body parts, + (their derivatives). + + By default, the observation does not include the robot's x-coordinate (`rootx`). + This can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (18,), float64)`, where the first observation element is the x--coordinate of the robot. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. By default, observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following: - | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Unit | + | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ | | excluded | x-coordinate of the torso | -Inf | Inf | rootx | slide | position (m) | | 0 | z-coordinate of the torso (height of Walker2d) | -Inf | Inf | rootz | slide | position (m) | @@ -78,7 +92,8 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): ## Rewards - The reward consists of three parts: + The total reward is: ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost*. + - *healthy_reward*: Every timestep that the Walker2d is alive, it receives a fixed reward of value `healthy_reward`, - *forward_reward*: @@ -94,14 +109,16 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): $w_{control} \times \\|action\\|_2^2$, where $w_{control}$ is `ctrl_cost_weight` (default is $10^{-3}$). - The total reward returned is ***reward*** *=* *healthy_reward bonus + forward_reward - ctrl_cost* - and `info` will also contain the individual reward terms. + `info` contains the individual reward terms. ## Starting State - All observations start in state - (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) - with a uniform noise in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity. + The initial position state is $[0, 1.25, 0, 0, 0, 0, 0, 0, 0] + \mathcal{U}_{[-reset\_noise\_scale \times 1_{9}, reset\_noise\_scale \times 1_{9}]}$. + The initial velocity state is $\mathcal{U}_{[-reset\_noise\_scale \times 1_{9}, reset\_noise\_scale \times 1_{9}]}$. + + where $\mathcal{U}$ is the multivariate uniform continuous distribution. + + Note that the z-coordinate is non-zero so that the walker2d can stand up immediately. ## Episode End @@ -120,7 +137,8 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): ## Arguments - `gymnasium.make` takes additional arguments such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. + Walker2d provides a range of parameters to modify the observation space, reward function, initial state, and termination condition. + These parameters can be applied during `gymnasium.make` in the following way: ```python import gymnasium as gym @@ -158,17 +176,17 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): def __init__( self, - xml_file="walker2d_v5.xml", - frame_skip=4, - default_camera_config=DEFAULT_CAMERA_CONFIG, - forward_reward_weight=1.0, - ctrl_cost_weight=1e-3, - healthy_reward=1.0, - terminate_when_unhealthy=True, - healthy_z_range=(0.8, 2.0), - healthy_angle_range=(-1.0, 1.0), - reset_noise_scale=5e-3, - exclude_current_positions_from_observation=True, + xml_file: str = "walker2d_v5.xml", + frame_skip: int = 4, + default_camera_config: Dict[str, float] = DEFAULT_CAMERA_CONFIG, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 1e-3, + healthy_reward: float = 1.0, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (0.8, 2.0), + healthy_angle_range: Tuple[float, float] = (-1.0, 1.0), + reset_noise_scale: float = 5e-3, + exclude_current_positions_from_observation: bool = True, **kwargs, ): utils.EzPickle.__init__( From 2bb1687947e91c8aa8b0d42baeba4515c0071108 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Fri, 14 Jul 2023 19:08:19 +0300 Subject: [PATCH 48/53] add `test_reset_noise_scale` --- tests/envs/mujoco/test_mujoco_v5.py | 124 ++++++++++++++-------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/tests/envs/mujoco/test_mujoco_v5.py b/tests/envs/mujoco/test_mujoco_v5.py index dfaef1008..5dfd4cb3a 100644 --- a/tests/envs/mujoco/test_mujoco_v5.py +++ b/tests/envs/mujoco/test_mujoco_v5.py @@ -1,3 +1,4 @@ +import collections import warnings import mujoco @@ -314,76 +315,40 @@ def test_reward_sum(version: str): ) +env_conf = collections.namedtuple("env_conf", "env_name, obs, rew, term, info") + + # Note: the environtments "HalfCheetah", "Pusher", "Swimmer", are identical between `v4` & `v5` (excluding `info`) -def test_identical_behaviour_v45(): +@pytest.mark.parametrize( + "env_conf", + [ + env_conf("Ant", True, True, False, "skip"), + env_conf("HalfCheetah", False, False, False, "skip"), + env_conf("Hopper", False, True, False, "superset"), + # skipping humanoid, everything has changed + env_conf("HumanoidStandup", True, False, False, "superset"), + env_conf("InvertedDoublePendulum", True, True, False, "superset"), + env_conf("InvertedPendulum", False, True, False, "superset"), + env_conf("Pusher", False, False, False, "keys-superset"), + env_conf("Reacher", True, False, False, "keys-equivalence"), + env_conf("Swimmer", False, False, False, "skip"), + env_conf("Walker2d", True, True, True, "keys-superset"), + ], +) +def test_identical_behaviour_v45(env_conf): """Verify that v4 -> v5 transition. Does not change the behaviour of the environments in any unexpected way.""" NUM_STEPS = 100 + env_v4 = gym.make(f"{env_conf.env_name}-v4") + env_v5 = gym.make(f"{env_conf.env_name}-v5") - env_v4 = gym.make("Ant-v4") - env_v5 = gym.make("Ant-v5", include_cfrc_ext_in_observation=False) - check_environments_match( - env_v4, env_v5, NUM_STEPS, skip_rew=True, info_comparison="skip" - ) - - env_v4 = gym.make("HalfCheetah-v4") - env_v5 = gym.make("HalfCheetah-v5") - check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") - - env_v4 = gym.make("Hopper-v4") - env_v5 = gym.make("Hopper-v5") - check_environments_match( - env_v4, env_v5, NUM_STEPS, skip_rew=True, info_comparison="superset" - ) - - # skipping humanoid, everything has changed - - env_v4 = gym.make("HumanoidStandup-v4") - env_v5 = gym.make("HumanoidStandup-v5") - check_environments_match( - env_v4, env_v5, NUM_STEPS, skip_obs=True, info_comparison="superset" - ) - - env_v4 = gym.make("InvertedDoublePendulum-v4") - env_v5 = gym.make("InvertedDoublePendulum-v5") check_environments_match( env_v4, env_v5, NUM_STEPS, - skip_obs=True, - skip_rew=True, - info_comparison="superset", - ) - - env_v4 = gym.make("InvertedPendulum-v4") - env_v5 = gym.make("InvertedPendulum-v5") - check_environments_match( - env_v4, env_v5, NUM_STEPS, skip_rew=True, info_comparison="superset" - ) - - env_v4 = gym.make("Pusher-v4") - env_v5 = gym.make("Pusher-v5") - check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="keys-superset") - - env_v4 = gym.make("Reacher-v4") - env_v5 = gym.make("Reacher-v5") - check_environments_match( - env_v4, env_v5, NUM_STEPS, skip_obs=True, info_comparison="keys-equivalence" - ) - - env_v4 = gym.make("Swimmer-v4") - env_v5 = gym.make("Swimmer-v5") - check_environments_match(env_v4, env_v5, NUM_STEPS, info_comparison="skip") - - env_v4 = gym.make("Walker2d-v4") - env_v5 = gym.make("Walker2d-v5") - check_environments_match( - env_v4, - env_v5, - NUM_STEPS, - skip_obs=True, - skip_rew=True, - skip_terminal=True, - info_comparison="keys-superset", + skip_obs=env_conf.obs, + skip_rew=env_conf.rew, + skip_terminal=env_conf.term, + info_comparison=env_conf.info, ) @@ -674,3 +639,38 @@ def test_dt(): assert env_a.dt == env_b.dt # check_environments_match(env_a, env_b, num_steps=100) # This Fails as expected + + +@pytest.mark.parametrize( + "env_id", + [ + "Ant-v5", + "Ant-v4", + "Ant-v3", + "HalfCheetah-v5", + "HalfCheetah-v4", + "HalfCheetah-v3", + "Hopper-v5", + "Hopper-v4", + "Hopper-v3", + "Humanoid-v5", + "Humanoid-v4", + "Humanoid-v3", + "HumanoidStandup-v5", + "InvertedDoublePendulum-v5", + "InvertedPendulum-v5", + "Swimmer-v5", + "Swimmer-v4", + "Swimmer-v3", + "Walker2d-v5", + "Walker2d-v4", + "Walker2d-v3", + ], +) +def test_reset_noise_scale(env_id): + """Checks that when `reset_noise_scale=0` we have deterministic initialization.""" + env: BaseMujocoEnv = gym.make(env_id, reset_noise_scale=0).unwrapped + env.reset() + + assert np.all(env.data.qpos == env.init_qpos) + assert np.all(env.data.qvel == env.init_qvel) From 0c2c45bd02c1fc44113f5f44494dd3dcffbf2349 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Fri, 14 Jul 2023 19:48:46 +0300 Subject: [PATCH 49/53] cleanup `humanoidstandup` --- gymnasium/envs/mujoco/humanoidstandup_v5.py | 175 ++++++++++---------- 1 file changed, 88 insertions(+), 87 deletions(-) diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index e48f37027..58474ee5f 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -64,18 +64,33 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): ## Observation Space - Observations consist of positional values of different body parts of the Humanoid, - followed by the velocities of those individual parts (their derivatives) with all the - positions ordered before all the velocities. - - By default, observations do not include the x- and y-coordinates of the torso. These may - be included by passing `exclude_current_positions_from_observation=False` during construction. - In that case, the observation space will be a `Box(-Inf, Inf, (350,), float64)` where the first two observations - represent the x- and y-coordinates of the torso. - Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates - will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively. - - However, by default, the observation is a `Box(-Inf, Inf, (348,), float64)`. The elements correspond to the following: + The observation space consists of the following parts (in order) + + - qpos:* The position values of the robot's body parts. + - qvel:* The velocities of these individual body parts, + (their derivatives). + - *cinert:* Mass and inertia of the rigid body parts relative to the center of mass, + (this is an intermediate result of the transition). + It has shape 13*10 (*nbody * 10*) and thus adds another 130 elements to the observation space. + (cinert - inertia matrix and body mass offset and body mass) + - *cvel:* Center of mass based velocity. + It has shape 13 * 6 (*nbody * 6*) and thus adds another 78 elements to the observation space. + (com velocity - velocity x, y, z and angular velocity x, y, z) + - *qfrc_actuator:* Constraint force generated as the actuator force at each joint. + This has shape `(17,)` *(nv * 1)* and thus adds another 17 elements to the observation space. + - *cfrc_ext:* This is the center of mass based external force on the body parts. + It has shape 13 * 6 (*nbody * 6*) and thus adds to another 78 elements in the observation space. + (external forces - force x, y, z and torque x, y, z) + + where *nbody* is for the number of bodies in the robot + and *nv* is for the number of degrees of freedom (*= dim(qvel)*). + + By default, the observation does not include the x- and y-coordinates of the torso. + These can be be included by passing `exclude_current_positions_from_observation=False` during construction. + In this case, the observation space will be a `Box(-Inf, Inf, (350,), float64)`, where the first two observations are the x- and y-coordinates of the torso. + Regardless of whether `exclude_current_positions_from_observation` is set to true or false, the x- and y-coordinates are returned in `info` with keys `"x_position"` and `"y_position"`, respectively. + + By default, however, the observation space is a `Box(-Inf, Inf, (348,), float64)`, where the position and velocity elements are as follows: | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- | @@ -104,89 +119,75 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): | 22 | x-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | | 23 | y-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | | 24 | z-coordinate velocity of the torso (centre) | -Inf | Inf | root | free | velocity (m/s) | - | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | anglular velocity (rad/s) | - | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | anglular velocity (rad/s) | - | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | anglular velocity (rad/s) | - | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | aanglular velocity (rad/s) | - | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | anglular velocity (rad/s) | - | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | anglular velocity (rad/s) | - | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | anglular velocity (rad/s) | - | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | anglular velocity (rad/s) | - | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | anglular velocity (rad/s) | - | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | anglular velocity (rad/s) | - | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | anglular velocity (rad/s) | - | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | anglular velocity (rad/s) | - | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | anglular velocity (rad/s) | - | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | anglular velocity (rad/s) | - | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | anglular velocity (rad/s) | - | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | anglular velocity (rad/s) | - | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | anglular velocity (rad/s) | - | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | anglular velocity (rad/s) | + | 25 | x-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 26 | y-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 27 | z-coordinate angular velocity of the torso (centre) | -Inf | Inf | root | free | angular velocity (rad/s) | + | 28 | z-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_z | hinge | angular velocity (rad/s) | + | 29 | y-coordinate of angular velocity of the abdomen (in lower_waist) | -Inf | Inf | abdomen_y | hinge | angular velocity (rad/s) | + | 30 | x-coordinate of angular velocity of the abdomen (in pelvis) | -Inf | Inf | abdomen_x | hinge | angular velocity (rad/s) | + | 31 | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_x | hinge | angular velocity (rad/s) | + | 32 | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_z | hinge | angular velocity (rad/s) | + | 33 | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh) | -Inf | Inf | right_hip_y | hinge | angular velocity (rad/s) | + | 34 | angular velocity of the angle between right hip and the right shin (in right_knee) | -Inf | Inf | right_knee | hinge | angular velocity (rad/s) | + | 35 | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_x | hinge | angular velocity (rad/s) | + | 36 | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_z | hinge | angular velocity (rad/s) | + | 37 | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh) | -Inf | Inf | left_hip_y | hinge | angular velocity (rad/s) | + | 38 | angular velocity of the angle between left hip and the left shin (in left_knee) | -Inf | Inf | left_knee | hinge | angular velocity (rad/s) | + | 39 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1 | hinge | angular velocity (rad/s) | + | 40 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2 | hinge | angular velocity (rad/s) | + | 41 | angular velocity of the angle between right upper arm and right_lower_arm | -Inf | Inf | right_elbow | hinge | angular velocity (rad/s) | + | 42 | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder1 | hinge | angular velocity (rad/s) | + | 43 | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm) | -Inf | Inf | left_shoulder2 | hinge | angular velocity (rad/s) | + | 44 | angular velocity of the angle between left upper arm and left_lower_arm | -Inf | Inf | left_elbow | hinge | angular velocity (rad/s) | | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | - Additionally, after all the positional and velocity based values in the table, - the observation contains (in order): - - *cinert:* Mass and inertia of a single rigid body relative to the center of mass - (this is an intermediate result of transition). It has shape 13*10 (*nbody * 10*) - and hence adds to another 130 elements in the state space. - - *cvel:* Center of mass based velocity. It has shape 13 * 6 (*nbody * 6*) and hence - adds another 78 elements in the state space - - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape - `(17,)` *(nv * 1)* and hence adds another 17 elements to the state space. - - *cfrc_ext:* This is the center of mass based external force on the body. It has shape - 13 * 6 (*nbody * 6*) and hence adds to another 78 elements in the state space. - where *nbody* stands for the number of bodies in the robot and *nv* stands for the - number of degrees of freedom (*= dim(qvel)*) - The body parts are: - | id (for `v2`, `v3`, `v4)` | id (for `v5`) | body part | - | ---| --- | ------------ | - | 0 |excluded| worldbody (note: all values are constant 0) | - | 1 | 0 | torso | - | 2 | 1 | lwaist | - | 3 | 2 | pelvis | - | 4 | 3 | right_thigh | - | 5 | 4 | right_sin | - | 6 | 5 | right_foot | - | 7 | 6 | left_thigh | - | 8 | 7 | left_sin | - | 9 | 8 | left_foot | - | 10 | 9 | right_upper_arm | - | 11 | 10 | right_lower_arm | - | 12 | 11 | left_upper_arm | - | 13 | 12 | left_lower_arm | + | body part | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | worldbody (note: all values are constant 0) | 0 |excluded| + | torso |1 | 0 | + | lwaist |2 | 1 | + | pelvis |3 | 2 | + | right_thigh |4 | 3 | + | right_sin |5 | 4 | + | right_foot |6 | 5 | + | left_thigh |7 | 6 | + | left_sin |8 | 7 | + | left_foot |9 | 8 | + | right_upper_arm |10 | 9 | + | right_lower_arm |11 | 10 | + | left_upper_arm |12 | 11 | + | left_lower_arm |13 | 12 | The joints are: - | id (for `v2`, `v3`, `v4)` | id (for `v5`) | joint | - | ---| --- | ------------ | - | 0 |excluded| root (note: all values are constant 0) | - | 1 |excluded| root (note: all values are constant 0) | - | 2 |excluded| root (note: all values are constant 0) | - | 3 |excluded| root (note: all values are constant 0) | - | 4 |excluded| root (note: all values are constant 0) | - | 5 |excluded| root (note: all values are constant 0) | - | 6 | 0 | abdomen_z | - | 7 | 1 | abdomen_y | - | 8 | 2 | abdomen_x | - | 9 | 3 | right_hip_x | - | 10 | 4 | right_hip_z | - | 11 | 5 | right_hip_y | - | 12 | 6 | right_knee | - | 13 | 7 | left_hip_x | - | 14 | 8 | left_hiz_z | - | 15 | 9 | left_hip_y | - | 16 | 10 | left_knee | - | 17 | 11 | right_shoulder1 | - | 18 | 12 | right_shoulder2 | - | 19 | 13 | right_elbow| - | 20 | 14 | left_shoulder1 | - | 21 | 15 | left_shoulder2 | - | 22 | 16 | left_elfbow | + | joint | id (for `v2`, `v3`, `v4)` | id (for `v5`) | + | ------------- | --- | --- | + | root (note: all values are constant 0) | 0 |excluded| + | root (note: all values are constant 0) | 1 |excluded| + | root (note: all values are constant 0) | 2 |excluded| + | root (note: all values are constant 0) | 3 |excluded| + | root (note: all values are constant 0) | 4 |excluded| + | root (note: all values are constant 0) | 5 |excluded| + | abdomen_z | 6 | 0 | + | abdomen_y | 7 | 1 | + | abdomen_x | 8 | 2 | + | right_hip_x | 9 | 3 | + | right_hip_z | 10 | 4 | + | right_hip_y | 11 | 5 | + | right_knee | 12 | 6 | + | left_hip_x | 13 | 7 | + | left_hiz_z | 14 | 8 | + | left_hip_y | 15 | 9 | + | left_knee | 16 | 10 | + | right_shoulder1 | 17 | 11 | + | right_shoulder2 | 18 | 12 | + | right_elbow | 19 | 13 | + | left_shoulder1 | 20 | 14 | + | left_shoulder2 | 21 | 15 | + | left_elfbow | 22 | 16 | The (x,y,z) coordinates are translational DOFs while the orientations are rotational DOFs expressed as quaternions. One can read more about free joints on the From cc65b42d034c881a8fd7bba17a607c4aeec4379f Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sun, 16 Jul 2023 01:11:13 +0300 Subject: [PATCH 50/53] add changelog in "Version History" --- gymnasium/envs/mujoco/ant_v5.py | 17 +++++++++++++++++ gymnasium/envs/mujoco/half_cheetah_v5.py | 9 +++++++++ gymnasium/envs/mujoco/hopper_v5.py | 11 +++++++++++ gymnasium/envs/mujoco/humanoid_v5.py | 18 ++++++++++++++++++ gymnasium/envs/mujoco/humanoidstandup_v5.py | 16 ++++++++++++++++ .../envs/mujoco/inverted_double_pendulum_v5.py | 9 +++++++++ gymnasium/envs/mujoco/inverted_pendulum_v5.py | 9 +++++++++ gymnasium/envs/mujoco/pusher_v5.py | 8 ++++++++ gymnasium/envs/mujoco/reacher_v5.py | 8 ++++++++ gymnasium/envs/mujoco/swimmer_v5.py | 12 ++++++++++++ gymnasium/envs/mujoco/walker2d_v5.py | 12 ++++++++++++ 11 files changed, 129 insertions(+) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 18d7de83c..f44532c9f 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -208,6 +208,23 @@ class AntEnv(MujocoEnv, utils.EzPickle): |`use_contact_forces` (`v4` only) | **bool** | `False` | If true, it extends the observation space by adding contact forces (see `Observation Space` section) and includes contact_cost to the reward function (see `Rewards` section) | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Ant is unhealthy), now it is only given when the Ant is healthy. The `info["reward_survive"]` is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - The reward function now always includes `contact_cost`, before it was only included if `use_contact_forces=True` (can be set to `0` with `contact_cost_weight=0`). + - Excluded the `cfrc_ext` of `worldbody` from the observation space as it was always 0, and thus provided no useful information to the agent, resulting is slightly faster training (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Added the `main_body` argument, which specifies the body used to compute the forward reward (mainly useful for custom MuJoCo models). + - Added the `forward_reward_weight` argument, which defaults to `1` (effectively the same behavior as in `v4`). + - Added the `include_cfrc_ext_in_observation` argument, previously in `v4` the inclusion of `cfrc_ext` observations was controlled by `use_contact_forces` which defaulted to `False`, while `include_cfrc_ext_in_observation` defaults to `True`. + - Removed the `use_contact_forces` argument (note: its functionality has been replaced by `include_cfrc_ext_in_observation` and `contact_cost_weight`) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/214)). + - Fixed `info["reward_ctrl"]` sometimes containing `contact_cost` instead of `ctrl_cost`. + - Fixed `info["x_position"]` & `info["y_position"]` & `info["distance_from_origin"]` giving `xpos` instead of `qpos` observations (`xpos` observations are behind 1 `mj_step()` more [here](https://github.com/deepmind/mujoco/issues/889#issuecomment-1568896388)) (related [Github issue #1](https://github.com/Farama-Foundation/Gymnasium/issues/521) & [Github issue #2](https://github.com/Farama-Foundation/Gymnasium/issues/539)). + - Removed `info["forward_reward"]` as it is equivalent to `info["reward_forward"]`. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3, also removed contact forces from the default observation space (new variable `use_contact_forces=True` can restore them) * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) * v2: All continuous control environments now use mujoco-py >= 1.50 diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py index 2738b459c..8fa3319fe 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v5.py +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -141,6 +141,15 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Restored the `xml_file` argument (was removed in `v4`). + - Renamed `info["reward_run"]` to `info["reward_forward"]` to be consistent with the other environments. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen). * v2: All continuous control environments now use mujoco-py >= 1.50. diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 2341e751a..35d9edf56 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -156,6 +156,17 @@ class HopperEnv(MujocoEnv, utils.EzPickle): | `exclude_current_positions_from_observation` | **bool** | `True` | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Hopper was unhealthy), now it is only given when the Hopper is healthy. The `info["reward_survive"]` is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added individual reward terms in `info` (`info["reward_forward"]`, info`["reward_ctrl"]`, `info["reward_survive"]`). + - Added `info["z_distance_from_origin"]` which is equal to the vertical distance of the "torso" body from its initial position. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) * v2: All continuous control environments now use mujoco-py >= 1.50. diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index 26bb5743d..e3359562d 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -283,6 +283,24 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): | `include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Humanoid was unhealthy), now it is only given when the Humanoid is healthy. The `info["reward_survive"]` is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - Restored `contact_cost` and the corresponding `contact_cost_weight` and `contact_cost_range` arguments, with the same defaults as in `Humanoid-v3` (was removed in `v4`) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/504)). + - Excluded the `cinert` & `cvel` & `cfrc_ext` of `worldbody` and `root`/`freejoint` `qfrc_actuator` from the observation space, as it was always 0, and thus provided no useful information to the agent, resulting in slightly faster training) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added `include_cinert_in_observation`, `include_cvel_in_observation`, `include_qfrc_actuator_in_observation`, `include_cfrc_ext_in_observation` arguments to allow for the exclusion of observation elements from the observation space. + - Fixed `info["x_position"]` & `info["y_position"]` & `info["distance_from_origin"]` returning `xpos` instead of `qpos` based observations (`xpos` observations are behind 1 `mj_step()` more [here](https://github.com/deepmind/mujoco/issues/889#issuecomment-1568896388)) (related [Github issue #1](https://github.com/Farama-Foundation/Gymnasium/issues/521) & [Github issue #2](https://github.com/Farama-Foundation/Gymnasium/issues/539)). + - Added `info["tendon_length"]` and `info["tendon_velocity"]` containing observations of the Humanoid's 2 tendons connecting the hips to the knees. + - Renamed `info["reward_alive"]` to `info["reward_survive"]` to be consistent with the other environments. + - Renamed `info["reward_linvel"]` to `info["reward_forward"]` to be consistent with the other environments. + - Renamed `info["reward_quadctrl"]` to `info["reward_ctrl"]` to be consistent with the other environments. + - Removed `info["forward_reward"]` as it is equivalent to `info["reward_forward"]`. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) * v2: All continuous control environments now use mujoco-py >= 1.50 diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index 58474ee5f..34c2f62f2 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -268,6 +268,22 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): | `include_cfrc_ext_in_observation` | **bool** | `True` | Whether to include *cfrc_ext* elements in the observations. | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Excluded the `cinert` & `cvel` & `cfrc_ext` of `worldbody` and `root`/`freejoint` `qfrc_actuator` from the observation space, as it was always 0, and thus provided no useful information to the agent, resulting in slightly faster training) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added `xml_file` argument. + - Added `uph_cost_weight`, `ctrl_cost_weight`, `impact_cost_weight`, `impact_cost_range` arguments, to configure the reward function (defaults are effectively the same as in `v4`). + - Added `reset_noise_scale` argument, to set the range of initial states. + - Added `include_cinert_in_observation`, `include_cvel_in_observation`, `include_qfrc_actuator_in_observation`, `include_cfrc_ext_in_observation` arguments to allow for the exclusion of observation elements from the observation space. + - Added `info["tendon_length"]` and `info["tendon_velocity"]` containing observations of the Humanoid's 2 tendons connecting the hips to the knees. + - Added `info["x_position"]` & `info["y_position"]` , which contain the observations excluded when `exclude_current_positions_from_observation == True`. + - Added `info["z_distance_from_origin"]` which is equal to the vertical distance of the "torso" body from its initial position. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. * v3: This environment does not have a v3 release. * v2: All continuous control environments now use mujoco-py >= 1.50. diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py index 198524981..64389404d 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -134,6 +134,15 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): | `reset_noise_scale` | **float** | `0.1` | Scale of random perturbations of initial position and velocity (see section on Starting State) | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Pendulum is unhealthy), now it is only given if the DoublePendulum is healthy (not terminated)(related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/500)). + - Excluded the `qfrc_constraint` ("constraint force") of the hinges from the observation space (as it was always 0, thus providing no useful information to the agent, resulting is slightly faster training) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/228)). + - Added `xml_file` argument. + - Added `reset_noise_scale` argument, to set the range of initial states. + - Added `healthy_reward` argument to configure the reward function (defaults are effectively the same as in `v4`). + - Added individual reward terms in `info` ( `info["reward_survive"]`, `info["distance_penalty"]`, `info["velocity_penalty"]`). * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 * v3: This environment does not have a v3 release. * v2: All continuous control environments now use mujoco-py >= 1.50 diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_pendulum_v5.py index d9d83322a..50962730c 100644 --- a/gymnasium/envs/mujoco/inverted_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_pendulum_v5.py @@ -112,6 +112,15 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): | `reset_noise_scale` | **float** | `0.01` | Scale of random perturbations of initial position and velocity (see section on Starting State) | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Fixed bug: `healthy_reward` was given on every step (even if the Pendulum is unhealthy), now it is only given if the Pendulum is healthy (not terminated) (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/500)). + - Added `xml_file` argument. + - Added `reset_noise_scale` argument to set the range of initial states. + - Added `info["reward_survive"]` which contains the reward. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.. * v3: This environment does not have a v3 release. * v2: All continuous control environments now use mujoco-py >= 1.5. diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index 60bbe3746..cef4d6b97 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -148,6 +148,14 @@ class PusherEnv(MujocoEnv, utils.EzPickle): | `reward_control_weight` | **float** | `0.1` | Weight for *reward_control* term (see section on reward) | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Added `xml_file` argument. + - Added `reward_near_weight`, `reward_dist_weight`, `reward_control_weight` arguments, to configure the reward function (defaults are effectively the same as in `v4`). + - Fixed `info["reward_ctrl"]` being not being multiplied by the reward weight. + - Added `info["reward_near"]` which is equal to the reward term `reward_near`. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. * v3: This environment does not have a v3 release. * v2: All continuous control environments now use mujoco-py >= 1.50. diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index 05ad477e2..844bab5ab 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -127,6 +127,14 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): | `reward_control_weight` | **float** | `0.1` | Weight for *reward_control* term (see section on reward) | ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Removed `"z - position_fingertip"` from the observation space since it is always 0, and therefore provides no useful information to the agent, this should result is slightly faster training (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/204)). + - Added `xml_file` argument. + - Added `reward_dist_weight`, `reward_control_weight` arguments, to configure the reward function (defaults are effectively the same as in `v4`). + - Fixed `info["reward_ctrl"]` being not being multiplied by the reward weight. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 * v3: This environment does not have a v3 release. * v2: All continuous control environments now use mujoco-py >= 1.50 diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 7803dcfac..35d73b1a0 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -134,6 +134,18 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - Restored the `xml_file` argument (was removed in `v4`). + - Added `forward_reward_weight`, `ctrl_cost_weight`, to configure the reward function (defaults are effectively the same as in `v4`). + - Added `reset_noise_scale` argument to set the range of initial states. + - Added `exclude_current_positions_from_observation` argument. + - Replaced `info["reward_fwd"]` and `info["forward_reward"]` with `info["reward_forward"]` to be consistent with the other environments. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3. * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen). * v2: All continuous control environments now use mujoco-py >= 1.50. diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index 84b21fb63..9ae8d3ae2 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -159,6 +159,18 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): ## Version History + * v5: + - Minimum `mujoco` version is now 2.3.3. + - Added support for fully custom/third party `mujoco` models using the `xml_file` argument (previously only a few changes could be made to the existing models). + - Added `default_camera_config` argument, a dictionary for setting the `mj_camera` properties, mainly useful for custom environments. + - Added `env.observation_structure`, a dictionary for specifying the observation space compose (e.g. `qpos`, `qvel`), useful for building tooling and wrappers for the MuJoCo environments. + - Return a non-empty `info` with `reset()`, previously an empty dictionary was returned, the new keys are the same state information as `step()`. + - Added `frame_skip` argument, used to configure the `dt` (duration of `step()`), default varies by environment check environment documentation pages. + - In v2, v3 and v4 the models have different friction values for the two feet (left foot friction == 1.9 and right foot friction == 0.9). The `Walker-v5` model is updated to have the same friction for both feet (set to 1.9). This causes the Walker2d's the right foot to slide less on the surface and therefore require more force to move (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/477)). + - Fixed bug: `healthy_reward` was given on every step (even if the Walker2D is unhealthy), now it is only given if the Walker2d is healthy. The `info` "reward_survive" is updated with this change (related [Github issue](https://github.com/Farama-Foundation/Gymnasium/issues/526)). + - Restored the `xml_file` argument (was removed in `v4`). + - Added individual reward terms in `info` (`info["reward_forward"]`, info`["reward_ctrl"]`, `info["reward_survive"]`). + - Added `info["z_distance_from_origin"]` which is equal to the vertical distance of the "torso" body from its initial position. * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3 * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen) * v2: All continuous control environments now use mujoco-py >= 1.50 From ff65f7d0a5be59bd19f91a6976fd7203b71ab517 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Mon, 17 Jul 2023 14:11:06 +0300 Subject: [PATCH 51/53] typo fix --- gymnasium/envs/mujoco/reacher_v5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index 844bab5ab..f0491e669 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -9,7 +9,7 @@ from gymnasium.spaces import Box -DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0.0} +DEFAULT_CAMERA_CONFIG = {"trackbodyid": 0} class ReacherEnv(MujocoEnv, utils.EzPickle): From 3bd92630997862cc349e29ab0cace44356141350 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Tue, 18 Jul 2023 23:17:09 +0300 Subject: [PATCH 52/53] `ant` fix root obs name in DOCs --- gymnasium/envs/mujoco/ant_v5.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index f44532c9f..5cb7d74ca 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -71,11 +71,11 @@ class AntEnv(MujocoEnv, utils.EzPickle): | Num | Observation | Min | Max | Name (in corresponding XML file) | Joint | Type (Unit) | |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------| - | 0 | z-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | 1 | x-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 2 | y-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 3 | z-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | - | 4 | w-orientation of the torso (centre) | -Inf | Inf | torso | free | angle (rad) | + | 0 | z-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | 1 | x-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 2 | y-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 3 | z-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | + | 4 | w-orientation of the torso (centre) | -Inf | Inf | root | free | angle (rad) | | 5 | angle between torso and first link on front left | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | | 6 | angle between the two links on the front left | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | | 7 | angle between torso and first link on front right | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | @@ -84,12 +84,12 @@ class AntEnv(MujocoEnv, utils.EzPickle): | 10 | angle between the two links on the back left | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | | 11 | angle between torso and first link on back right | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | | 12 | angle between the two links on the back right | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | 13 | x-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 14 | y-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 15 | z-coordinate velocity of the torso | -Inf | Inf | torso | free | velocity (m/s) | - | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | - | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | torso | free | angular velocity (rad/s) | + | 13 | x-coordinate velocity of the torso | -Inf | Inf | root | free | velocity (m/s) | + | 14 | y-coordinate velocity of the torso | -Inf | Inf | root | free | velocity (m/s) | + | 15 | z-coordinate velocity of the torso | -Inf | Inf | root | free | velocity (m/s) | + | 16 | x-coordinate angular velocity of the torso | -Inf | Inf | root | free | angular velocity (rad/s) | + | 17 | y-coordinate angular velocity of the torso | -Inf | Inf | root | free | angular velocity (rad/s) | + | 18 | z-coordinate angular velocity of the torso | -Inf | Inf | root | free | angular velocity (rad/s) | | 19 | angular velocity of angle between torso and front left link | -Inf | Inf | hip_1 (front_left_leg) | hinge | angle (rad) | | 20 | angular velocity of the angle between front left links | -Inf | Inf | ankle_1 (front_left_leg) | hinge | angle (rad) | | 21 | angular velocity of angle between torso and front right link | -Inf | Inf | hip_2 (front_right_leg) | hinge | angle (rad) | @@ -98,8 +98,8 @@ class AntEnv(MujocoEnv, utils.EzPickle): | 24 | angular velocity of the angle between back left links | -Inf | Inf | ankle_3 (back_leg) | hinge | angle (rad) | | 25 | angular velocity of angle between torso and back right link | -Inf | Inf | hip_4 (right_back_leg) | hinge | angle (rad) | | 26 | angular velocity of the angle between back right links | -Inf | Inf | ankle_4 (right_back_leg) | hinge | angle (rad) | - | excluded | x-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | - | excluded | y-coordinate of the torso (centre) | -Inf | Inf | torso | free | position (m) | + | excluded | x-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | + | excluded | y-coordinate of the torso (centre) | -Inf | Inf | root | free | position (m) | The body parts are: From f559bb3e390858afc6b697f9cf7f68aa448837c5 Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas Date: Sun, 23 Jul 2023 13:53:41 +0300 Subject: [PATCH 53/53] @pseudo-rnd-thoughts 3rd review comments resolve --- gymnasium/envs/mujoco/ant_v5.py | 8 ++++---- gymnasium/envs/mujoco/half_cheetah_v5.py | 6 +++--- gymnasium/envs/mujoco/hopper_v5.py | 6 +++--- gymnasium/envs/mujoco/humanoid_v5.py | 20 +++++++++---------- gymnasium/envs/mujoco/humanoidstandup_v5.py | 20 +++++++++---------- .../mujoco/inverted_double_pendulum_v5.py | 2 +- gymnasium/envs/mujoco/inverted_pendulum_v5.py | 2 +- gymnasium/envs/mujoco/pusher_v5.py | 2 +- gymnasium/envs/mujoco/reacher_v5.py | 2 +- gymnasium/envs/mujoco/swimmer_v5.py | 2 +- gymnasium/envs/mujoco/walker2d_v5.py | 8 +++----- 11 files changed, 38 insertions(+), 40 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 5cb7d74ca..d4f3435d5 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -55,10 +55,10 @@ class AntEnv(MujocoEnv, utils.EzPickle): ## Observation Space The observation Space consists of the following parts (in order): - - qpos:* Position values of the robots's body parts. - - qvel:* The velocities of these individual body parts, + - qpos (13 elements by default):* Position values of the robots's body parts. + - qvel (14 elements):* The velocities of these individual body parts, (their derivatives). - - *cfrc_ext:* This is the center of mass based external forces on the body parts. + - *cfrc_ext (78 elements):* This is the center of mass based external forces on the body parts. It has shape 13 * 6 (*nbody * 6*) and hence adds to another 78 elements in the state space. (external forces - force x, y, z and torque x, y, z) @@ -179,7 +179,7 @@ class AntEnv(MujocoEnv, utils.EzPickle): 2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0]) #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py index 8fa3319fe..7ded23ec3 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v5.py +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -56,8 +56,8 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): ## Observation Space The observation Space consists of the following parts (in order): - - qpos:* Position values of the robots's body parts. - - qvel:* The velocities of these individual body parts, + - qpos (8 elements by default):* Position values of the robots's body parts. + - qvel (9 elements):* The velocities of these individual body parts, (their derivatives). By default, the observation does not include the robot's x-coordinate (`rootx`). @@ -120,7 +120,7 @@ class HalfCheetahEnv(MujocoEnv, utils.EzPickle): The Half Cheetah never terminates. #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index 35d9edf56..c058c3de4 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -54,8 +54,8 @@ class HopperEnv(MujocoEnv, utils.EzPickle): ## Observation Space The observation Space consists of the following parts (in order): - - qpos:* Position values of the robots's body parts. - - qvel:* The velocities of these individual body parts, + - qpos (5 elements by default):* Position values of the robots's body parts. + - qvel (6 elements):* The velocities of these individual body parts, (their derivatives). By default, observations do not include the x-coordinate of the hopper. It may @@ -130,7 +130,7 @@ class HopperEnv(MujocoEnv, utils.EzPickle): 3. The angle of the torso (`observation[1]` if `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range` #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index e3359562d..778ac23e0 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -72,19 +72,19 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): ## Observation Space The observation space consists of the following parts (in order) - - qpos:* The position values of the robot's body parts. - - qvel:* The velocities of these individual body parts, + - qpos (22 elements by default):* The position values of the robot's body parts. + - qvel (23 elements):* The velocities of these individual body parts, (their derivatives). - - *cinert:* Mass and inertia of the rigid body parts relative to the center of mass, + - *cinert (130 elements):* Mass and inertia of the rigid body parts relative to the center of mass, (this is an intermediate result of the transition). - It has shape 13*10 (*nbody * 10*) and thus adds another 130 elements to the observation space. + It has shape 13*10 (*nbody * 10*). (cinert - inertia matrix and body mass offset and body mass) - - *cvel:* Center of mass based velocity. - It has shape 13 * 6 (*nbody * 6*) and thus adds another 78 elements to the observation space. + - *cvel (78 elements):* Center of mass based velocity. + It has shape 13 * 6 (*nbody * 6*). (com velocity - velocity x, y, z and angular velocity x, y, z) - - *qfrc_actuator:* Constraint force generated as the actuator force at each joint. - This has shape `(17,)` *(nv * 1)* and thus adds another 17 elements to the observation space. - - *cfrc_ext:* This is the center of mass based external force on the body parts. + - *qfrc_actuator (17 elements):* Constraint force generated as the actuator force at each joint. + This has shape `(17,)` *(nv * 1)*. + - *cfrc_ext (78 elements):* This is the center of mass based external force on the body parts. It has shape 13 * 6 (*nbody * 6*) and thus adds to another 78 elements in the observation space. (external forces - force x, y, z and torque x, y, z) @@ -253,7 +253,7 @@ class HumanoidEnv(MujocoEnv, utils.EzPickle): 1. The z-position of the torso (the height) is no longer contained in `healthy_z_range`. #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index 34c2f62f2..fe6330560 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -66,19 +66,19 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): ## Observation Space The observation space consists of the following parts (in order) - - qpos:* The position values of the robot's body parts. - - qvel:* The velocities of these individual body parts, + - qpos (22 elements by default):* The position values of the robot's body parts. + - qvel (23 elements):* The velocities of these individual body parts, (their derivatives). - - *cinert:* Mass and inertia of the rigid body parts relative to the center of mass, + - *cinert (130 elements):* Mass and inertia of the rigid body parts relative to the center of mass, (this is an intermediate result of the transition). - It has shape 13*10 (*nbody * 10*) and thus adds another 130 elements to the observation space. + It has shape 13*10 (*nbody * 10*). (cinert - inertia matrix and body mass offset and body mass) - - *cvel:* Center of mass based velocity. - It has shape 13 * 6 (*nbody * 6*) and thus adds another 78 elements to the observation space. + - *cvel (78 elements):* Center of mass based velocity. + It has shape 13 * 6 (*nbody * 6*). (com velocity - velocity x, y, z and angular velocity x, y, z) - - *qfrc_actuator:* Constraint force generated as the actuator force at each joint. - This has shape `(17,)` *(nv * 1)* and thus adds another 17 elements to the observation space. - - *cfrc_ext:* This is the center of mass based external force on the body parts. + - *qfrc_actuator (17 elements):* Constraint force generated as the actuator force at each joint. + This has shape `(17,)` *(nv * 1)*. + - *cfrc_ext (78 elements):* This is the center of mass based external force on the body parts. It has shape 13 * 6 (*nbody * 6*) and thus adds to another 78 elements in the observation space. (external forces - force x, y, z and torque x, y, z) @@ -241,7 +241,7 @@ class HumanoidStandupEnv(MujocoEnv, utils.EzPickle): The Humanoid never terminates. #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py index 64389404d..dfa190ca2 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -115,7 +115,7 @@ class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle): Note: The maximum standing height of the system is 1.2 m when all the parts are perpendicularly vertical on top of each other. #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/inverted_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_pendulum_v5.py index 50962730c..6d519c90b 100644 --- a/gymnasium/envs/mujoco/inverted_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_pendulum_v5.py @@ -94,7 +94,7 @@ class InvertedPendulumEnv(MujocoEnv, utils.EzPickle): 2. The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian. #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index cef4d6b97..c1be462ce 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -128,7 +128,7 @@ class PusherEnv(MujocoEnv, utils.EzPickle): The Pusher never terminates. #### Truncation - The maximum duration of an episode is 100 timesteps. + The default duration of an episode is 100 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index f0491e669..6b0bdd630 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -108,7 +108,7 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): The Reacher never terminates. #### Truncation - The maximum duration of an episode is 50 timesteps. + The default duration of an episode is 50 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index 35d73b1a0..df727261e 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -112,7 +112,7 @@ class SwimmerEnv(MujocoEnv, utils.EzPickle): The Swimmer never terminates. #### Truncation - The maximum duration of an episode is 1000 timesteps. + The default duration of an episode is 1000 timesteps ## Arguments diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index 9ae8d3ae2..aa43a9875 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -58,8 +58,8 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): ## Observation Space The observation Space consists of the following parts (in order): - - qpos:* Position values of the robots's body parts. - - qvel:* The velocities of these individual body parts, + - qpos (8 elements by default):* Position values of the robots's body parts. + - qvel (9 elements):* The velocities of these individual body parts, (their derivatives). By default, the observation does not include the robot's x-coordinate (`rootx`). @@ -131,9 +131,7 @@ class Walker2dEnv(MujocoEnv, utils.EzPickle): 3. The absolute value of the angle (`observation[1]` if `exclude_current_positions_from_observation=False`, else `observation[2]`) is ***not*** in the closed interval specified by `healthy_angle_range` #### Truncation - the maximum duration of an episode is 1000 timesteps. - - If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded. + The default duration of an episode is 1000 timesteps ## Arguments