polish(pu): polish action_type and env_type, fix test.yml, fix unittest (#160)

* polish(pu): polish action_type and env_type for legal_action preprocessing

* polish(pu): use the latest clang in test

* fix(pu): use the latest clang in test on macOS

* fix(pu): use the latest clang in test on macOS

* fix(pu): use clang as c compiler in test on macOS

* test(pu): add test case of lunarlander_disc_gumbel_muzero_config when action_type is 'varied_action_space'

* fix(pu): fix unittest

* polish(pu): undo erroneous modifications in cartpole and lunarlander

* fix(pu): fix requirements.txt and typo in ucb_score of ptree_az

* fix(pu): fix unittest in test_ding_env_wrapper
puyuan1996 authored Dec 7, 2023
1 parent b6fd371 commit bbf371e
Showing 43 changed files with 123 additions and 38 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/test.yml
@@ -49,7 +49,13 @@ jobs:
shell: bash
run: |
brew install tree cloc wget curl make zip graphviz
brew install llvm # Install llvm (which includes clang)
echo 'export PATH="/usr/local/opt/llvm/bin:$PATH"' >> $GITHUB_ENV # update PATH
dot -V
- name: Set CC and CXX variables
run: |
echo "CC=$(which clang)" >> $GITHUB_ENV
echo "CXX=$(which clang++)" >> $GITHUB_ENV
- name: Set up python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
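The two steps above make Homebrew's LLVM clang the active C/C++ compiler for the rest of the job. A rough local equivalent in Python, assuming the Intel-mac Homebrew prefix (/usr/local; Apple Silicon installs under /opt/homebrew):

```python
# Hedged sketch: replicate the CI's compiler selection locally before
# building the C++ MCTS (ctree) extensions. The LLVM path is an assumption
# about the Homebrew prefix, not something read from the workflow.
import os
import shutil

llvm_bin = "/usr/local/opt/llvm/bin"
os.environ["PATH"] = llvm_bin + os.pathsep + os.environ["PATH"]
os.environ["CC"] = shutil.which("clang") or "clang"
os.environ["CXX"] = shutil.which("clang++") or "clang++"
# then e.g. `pip install -e .` to compile the extensions with this toolchain
```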
2 changes: 1 addition & 1 deletion lzero/envs/tests/test_ding_env_wrapper.py
@@ -26,6 +26,6 @@ def test(self):

obs = ding_env.reset()

assert isinstance(obs, np.ndarray)
assert isinstance(obs[0], np.ndarray)
action = ding_env.random_action()
print('random_action: {}, action_space: {}'.format(action.shape, ding_env.action_space))
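The relaxed assertion follows the gymnasium-style reset API (see the requirements.txt change below): reset() returns an (observation, info) tuple, so the observation array sits at index 0. A minimal sketch of the behavior under test, using a stock gymnasium environment as a stand-in:

```python
# Illustrative only: under the gymnasium API, reset() returns a tuple,
# so the observation array is element 0 rather than the return value itself.
import gymnasium as gym
import numpy as np

env = gym.make("CartPole-v1")
obs_and_info = env.reset()
assert isinstance(obs_and_info, tuple)          # (observation, info)
assert isinstance(obs_and_info[0], np.ndarray)  # matches the updated test
```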
2 changes: 2 additions & 0 deletions lzero/mcts/buffer/game_buffer.py
@@ -53,6 +53,8 @@ def __init__(self, cfg: dict):
self._cfg = default_config
self._cfg = cfg
assert self._cfg.env_type in ['not_board_games', 'board_games']
assert self._cfg.action_type in ['fixed_action_space', 'varied_action_space']

self.replay_buffer_size = self._cfg.replay_buffer_size
self.batch_size = self._cfg.batch_size
self._alpha = self._cfg.priority_prob_alpha
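env_type and action_type are validated as two independent switches: whether the environment is a two-player board game, and whether the set of legal actions varies from state to state. A minimal configuration sketch; EasyDict mirrors how LightZero configs are usually declared, and all other keys are omitted:

```python
# Hedged sketch of the two config switches the buffer validates;
# abbreviated, illustrative config only.
from easydict import EasyDict

cfg = EasyDict(
    env_type='board_games',             # ['not_board_games', 'board_games']
    action_type='varied_action_space',  # ['fixed_action_space', 'varied_action_space']
)
assert cfg.env_type in ['not_board_games', 'board_games']
assert cfg.action_type in ['fixed_action_space', 'varied_action_space']
```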
5 changes: 3 additions & 2 deletions lzero/mcts/buffer/game_buffer_efficientzero.py
@@ -30,6 +30,7 @@ def __init__(self, cfg: dict):
default_config.update(cfg)
self._cfg = default_config
assert self._cfg.env_type in ['not_board_games', 'board_games']
assert self._cfg.action_type in ['fixed_action_space', 'varied_action_space']
self.replay_buffer_size = self._cfg.replay_buffer_size
self.batch_size = self._cfg.batch_size
self._alpha = self._cfg.priority_prob_alpha
@@ -406,7 +407,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
else:
if self._cfg.mcts_ctree:
# cpp mcts_tree
if self._cfg.env_type == 'not_board_games':
if self._cfg.action_type == 'fixed_action_space':
sum_visits = sum(distributions)
policy = [visit_count / sum_visits for visit_count in distributions]
target_policies.append(policy)
@@ -421,7 +422,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
target_policies.append(policy_tmp)
else:
# python mcts_tree
if self._cfg.env_type == 'not_board_games':
if self._cfg.action_type == 'fixed_action_space':
sum_visits = sum(distributions)
policy = [visit_count / sum_visits for visit_count in distributions]
target_policies.append(policy)
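Note the retargeted branch: what matters when turning MCTS visit counts into a target policy is not whether the game is a board game but whether every action always exists. With a fixed action space the counts normalize directly; with a varied one, counts exist only for the legal actions and must be scattered into a full-size vector. An illustrative sketch (names are not the buffer's actual locals):

```python
# Illustrative only -- not the buffer's actual code path.
import numpy as np

def visit_count_policy(distributions, action_space_size, legal_actions=None):
    total = sum(distributions)
    if legal_actions is None:
        # fixed_action_space: one visit count per action, normalize directly.
        return [v / total for v in distributions]
    # varied_action_space: counts cover only the legal actions, so scatter
    # the normalized values into a full-size policy vector.
    policy = np.zeros(action_space_size)
    for a, v in zip(legal_actions, distributions):
        policy[a] = v / total
    return policy.tolist()
```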
5 changes: 3 additions & 2 deletions lzero/mcts/buffer/game_buffer_gumbel_muzero.py
@@ -1,10 +1,11 @@
from typing import Any, List, Tuple, Union, TYPE_CHECKING, Optional
from typing import Any, Tuple

import numpy as np
from ding.utils import BUFFER_REGISTRY

from lzero.mcts.utils import prepare_observation
from lzero.mcts.buffer import MuZeroGameBuffer
from lzero.mcts.utils import prepare_observation


@BUFFER_REGISTRY.register('game_buffer_gumbel_muzero')
class GumbelMuZeroGameBuffer(MuZeroGameBuffer):
7 changes: 4 additions & 3 deletions lzero/mcts/buffer/game_buffer_muzero.py
@@ -33,6 +33,7 @@ def __init__(self, cfg: dict):
default_config.update(cfg)
self._cfg = default_config
assert self._cfg.env_type in ['not_board_games', 'board_games']
assert self._cfg.action_type in ['fixed_action_space', 'varied_action_space']
self.replay_buffer_size = self._cfg.replay_buffer_size
self.batch_size = self._cfg.batch_size
self._alpha = self._cfg.priority_prob_alpha
@@ -497,7 +498,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:

# for board games
policy_obs_list, policy_mask, pos_in_game_segment_list, batch_index_list, child_visits, game_segment_lens, action_mask_segment, \
to_play_segment = policy_re_context # noqa
to_play_segment = policy_re_context
# transition_batch_size = game_segment_batch_size * (self._cfg.num_unroll_steps + 1)
transition_batch_size = len(policy_obs_list)
game_segment_batch_size = len(pos_in_game_segment_list)
@@ -579,7 +580,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
list(np.ones(self._cfg.model.action_space_size) / self._cfg.model.action_space_size)
)
else:
if self._cfg.env_type == 'not_board_games':
if self._cfg.action_type == 'fixed_action_space':
# for atari/classic_control/box2d environments that only have one player.
sum_visits = sum(distributions)
policy = [visit_count / sum_visits for visit_count in distributions]
@@ -657,7 +658,7 @@ def _compute_target_policy_non_reanalyzed(
policy_mask.append(1)
# NOTE: child_visit is already a distribution
distributions = child_visit[current_index]
if self._cfg.env_type == 'not_board_games':
if self._cfg.action_type == 'fixed_action_space':
# for atari/classic_control/box2d environments that only have one player.
target_policies.append(distributions)
else:
3 changes: 2 additions & 1 deletion lzero/mcts/buffer/game_buffer_sampled_efficientzero.py
@@ -30,6 +30,7 @@ def __init__(self, cfg: dict):
default_config.update(cfg)
self._cfg = default_config
assert self._cfg.env_type in ['not_board_games', 'board_games']
assert self._cfg.action_type in ['fixed_action_space', 'varied_action_space']
self.replay_buffer_size = self._cfg.replay_buffer_size
self.batch_size = self._cfg.batch_size
self._alpha = self._cfg.priority_prob_alpha
@@ -540,7 +541,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
)
)
else:
if self._cfg.env_type == 'not_board_games':
if self._cfg.action_type == 'fixed_action_space':
sum_visits = sum(distributions)
policy = [visit_count / sum_visits for visit_count in distributions]
target_policies.append(policy)
1 change: 1 addition & 0 deletions lzero/mcts/buffer/game_buffer_stochastic_muzero.py
@@ -26,6 +26,7 @@ def __init__(self, cfg: dict):
default_config.update(cfg)
self._cfg = default_config
assert self._cfg.env_type in ['not_board_games', 'board_games']
assert self._cfg.action_type in ['fixed_action_space', 'varied_action_space']
self.replay_buffer_size = self._cfg.replay_buffer_size
self.batch_size = self._cfg.batch_size
self._alpha = self._cfg.priority_prob_alpha
14 changes: 8 additions & 6 deletions lzero/mcts/ctree/ctree_sampled_efficientzero/lib/cnode.cpp
@@ -380,12 +380,14 @@ namespace tree
// After sorting, the first vector is the index, and the second vector is the probability value after perturbation sorted from large to small.
for (size_t iter = 0; iter < disturbed_probs.size(); iter++)
{
// #ifdef __APPLE__
// disc_action_with_probs.__emplace_back(std::make_pair(iter, disturbed_probs[iter]));
// #else
// disc_action_with_probs.emplace_back(std::make_pair(iter, disturbed_probs[iter]));
// #endif
disc_action_with_probs.emplace_back(std::make_pair(iter, disturbed_probs[iter]));

#ifdef __GNUC__
// Use push_back for GCC
disc_action_with_probs.push_back(std::make_pair(iter, disturbed_probs[iter]));
#else
// Use emplace_back for other compilers
disc_action_with_probs.emplace_back(std::make_pair(iter, disturbed_probs[iter]));
#endif
}

std::sort(disc_action_with_probs.begin(), disc_action_with_probs.end(), cmp);
2 changes: 1 addition & 1 deletion lzero/mcts/ptree/ptree_az.py
@@ -402,7 +402,7 @@ def _ucb_score(self, parent: Node, child: Node) -> float:
Overview:
Compute UCB score. The score for a node is based on its value, plus an exploration bonus based on the prior.
For more details, please refer to this paper: http://gauss.ececs.uc.edu/Workshops/isaim2010/papers/rosin.pdf
UCB = Q(s,a) + P(s,a) \cdot \frac{N(\text{parent})}{1+N(\text{child})} \cdot \left(c_1 + \log\left(\frac{N(\text{parent})+c_2+1}{c_2}\right)\right)
UCB = Q(s,a) + P(s,a) \cdot \frac{ \sqrt{N(\text{parent})}}{1+N(\text{child})} \cdot \left(c_1 + \log\left(\frac{N(\text{parent})+c_2+1}{c_2}\right)\right)
- Q(s,a): value of a child node.
- P(s,a): The prior of a child node.
- N(parent): The number of the visiting of the parent node.
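The docstring fix restores the square root over the parent visit count, matching the standard PUCT rule. A minimal sketch of the corrected score; the constants follow common MuZero defaults and are assumptions here, not values read from ptree_az:

```python
# Hedged sketch of the corrected UCB/PUCT formula from the docstring.
import math

def ucb_score(q_value, prior, parent_visits, child_visits,
              c1=1.25, c2=19652.0):
    # exploration term: c1 + log((N(parent) + c2 + 1) / c2)
    pb_c = c1 + math.log((parent_visits + c2 + 1) / c2)
    # the fix: sqrt(N(parent)) in the numerator, not N(parent)
    pb_c *= math.sqrt(parent_visits) / (child_visits + 1)
    return q_value + prior * pb_c
```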
@@ -32,6 +32,9 @@
channel_last=True,
scale=True,
stop_value=1,
alphazero_mcts_ctree=False,
save_replay_gif=False,
replay_path_gif='./replay_gif',
),
policy=dict(
sampled_algo=False,
1 change: 1 addition & 0 deletions lzero/mcts/tests/test_game_buffer.py
@@ -14,6 +14,7 @@
replay_buffer_size=10000,
env_type='not_board_games',
use_priority=True,
action_type='fixed_action_space',
)
)

2 changes: 2 additions & 0 deletions lzero/policy/efficientzero.py
@@ -77,6 +77,8 @@ class EfficientZeroPolicy(MuZeroPolicy):
evaluator_env_num=3,
# (str) The type of environment. The options are ['not_board_games', 'board_games'].
env_type='not_board_games',
# (str) The type of action space. Options are ['fixed_action_space', 'varied_action_space'].
action_type='fixed_action_space',
# (str) The type of battle mode. The options are ['play_with_bot_mode', 'self_play_mode'].
battle_mode='play_with_bot_mode',
# (bool) Whether to monitor extra statistics in tensorboard.
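The default documents the expected pairing: single-player environments with a constant action set keep 'fixed_action_space', while board games whose legal moves change each turn override it, as the board-game configs further below do. A sketch of such an override (abbreviated, illustrative keys only):

```python
# Illustrative override only -- key names mirror the defaults above,
# the surrounding config structure is abbreviated.
policy = dict(
    env_type='board_games',
    action_type='varied_action_space',  # legal moves change every turn
    battle_mode='play_with_bot_mode',
)
```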
2 changes: 2 additions & 0 deletions lzero/policy/gumbel_muzero.py
@@ -78,6 +78,8 @@ class GumbelMuZeroPolicy(MuZeroPolicy):
evaluator_env_num=3,
# (str) The type of environment. Options are ['not_board_games', 'board_games'].
env_type='not_board_games',
# (str) The type of action space. Options are ['fixed_action_space', 'varied_action_space'].
action_type='fixed_action_space',
# (str) The type of battle mode. Options are ['play_with_bot_mode', 'self_play_mode'].
battle_mode='play_with_bot_mode',
# (bool) Whether to monitor extra statistics in tensorboard.
Expand Down
2 changes: 2 additions & 0 deletions lzero/policy/muzero.py
@@ -80,6 +80,8 @@ class MuZeroPolicy(Policy):
evaluator_env_num=3,
# (str) The type of environment. Options are ['not_board_games', 'board_games'].
env_type='not_board_games',
# (str) The type of action space. Options are ['fixed_action_space', 'varied_action_space'].
action_type='fixed_action_space',
# (str) The type of battle mode. Options are ['play_with_bot_mode', 'self_play_mode'].
battle_mode='play_with_bot_mode',
# (bool) Whether to monitor extra statistics in tensorboard.
2 changes: 2 additions & 0 deletions lzero/policy/sampled_efficientzero.py
@@ -84,6 +84,8 @@ class SampledEfficientZeroPolicy(MuZeroPolicy):
evaluator_env_num=3,
# (str) The type of environment. The options are ['not_board_games', 'board_games'].
env_type='not_board_games',
# (str) The type of action space. Options are ['fixed_action_space', 'varied_action_space'].
action_type='fixed_action_space',
# (str) The type of battle mode. The options are ['play_with_bot_mode', 'self_play_mode'].
battle_mode='play_with_bot_mode',
# (bool) Whether to monitor extra statistics in tensorboard.
2 changes: 2 additions & 0 deletions lzero/policy/stochastic_muzero.py
@@ -71,6 +71,8 @@ class StochasticMuZeroPolicy(MuZeroPolicy):
evaluator_env_num=3,
# (str) The type of environment. Options are ['not_board_games', 'board_games'].
env_type='not_board_games',
# (str) The type of action space. Options are ['fixed_action_space', 'varied_action_space'].
action_type='fixed_action_space',
# (str) The type of battle mode. Options are ['play_with_bot_mode', 'self_play_mode'].
battle_mode='play_with_bot_mode',
# (bool) Whether to monitor extra statistics in tensorboard.
3 changes: 1 addition & 2 deletions requirements.txt
@@ -1,5 +1,4 @@
DI-engine[common_env]>=0.4.7
gym[accept-rom-license]==0.25.1
DI-engine>=0.4.7
gymnasium[atari]
numpy>=1.22.4
pympler
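With the pinned gym release swapped for gymnasium[atari], Atari environments resolve through the ALE namespace. A minimal smoke test under that assumption; it presumes the Atari ROMs are installed and licensed:

```python
# Hedged sketch: verify the gymnasium-based Atari dependency resolves.
# The env id follows ale-py's registration convention.
import gymnasium as gym

env = gym.make("ALE/Pong-v5")
obs, info = env.reset(seed=0)
print(obs.shape, env.action_space)
```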
@@ -34,6 +34,7 @@
scale=True,
screen_scaling=9,
render_mode=None,
replay_path=None,
alphazero_mcts_ctree=mcts_ctree,
# ==============================================================
),
@@ -52,6 +53,7 @@
),
cuda=True,
env_type='board_games',
action_type='varied_action_space',
update_per_collect=update_per_collect,
batch_size=batch_size,
optim_type='Adam',
@@ -11,6 +11,8 @@
batch_size = 256
max_env_step = int(1e6)
model_path = None
mcts_ctree = False

# ==============================================================
# end of the most frequently changed config specified by the user
# ==============================================================
@@ -19,12 +21,31 @@
env=dict(
battle_mode='self_play_mode',
bot_action_type='rule',
channel_last=False,
collector_env_num=collector_env_num,
evaluator_env_num=evaluator_env_num,
n_evaluator_episode=evaluator_env_num,
manager=dict(shared_memory=False, ),
# ==============================================================
# for the creation of simulation env
agent_vs_human=False,
prob_random_agent=0,
prob_expert_agent=0,
prob_random_action_in_bot=0,
scale=True,
screen_scaling=9,
render_mode=None,
replay_path=None,
alphazero_mcts_ctree=mcts_ctree,
# ==============================================================
),
policy=dict(
mcts_ctree=mcts_ctree,
# ==============================================================
# for the creation of simulation env
simulation_env_name='connect4',
simulation_env_config_type='self_play',
# ==============================================================
model=dict(
observation_shape=(3, 6, 7),
action_space_size=7,
@@ -33,6 +54,7 @@
),
cuda=True,
env_type='board_games',
action_type='varied_action_space',
update_per_collect=update_per_collect,
batch_size=batch_size,
optim_type='Adam',
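Connect4 is a natural 'varied_action_space' case: a column is only playable while it has room, so the legal-action set shrinks as the board fills. A small illustration; the board encoding here is assumed for the example, not taken from the env implementation:

```python
# Illustrative only: 6x7 board, 0 marks an empty cell.
import numpy as np

board = np.zeros((6, 7), dtype=np.int8)
board[:, 3] = 1                                        # column 3 is full
legal_actions = [c for c in range(7) if board[0, c] == 0]
print(legal_actions)                                   # [0, 1, 2, 4, 5, 6]
```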
@@ -40,6 +40,7 @@
),
cuda=True,
env_type='board_games',
action_type='varied_action_space',
game_segment_length=int(6 * 7 / 2), # for battle_mode='play_with_bot_mode'
update_per_collect=update_per_collect,
batch_size=batch_size,
@@ -40,6 +40,7 @@
),
cuda=True,
env_type='board_games',
action_type='varied_action_space',
game_segment_length=int(6 * 7), # for battle_mode='self_play_mode'
update_per_collect=update_per_collect,
batch_size=batch_size,
@@ -12,7 +12,7 @@
batch_size = 256
max_env_step = int(5e5)
prob_random_action_in_bot = 0.5
mcts_ctree = True
mcts_ctree = False
# ==============================================================
# end of the most frequently changed config specified by the user
# ==============================================================
@@ -37,6 +37,7 @@
scale=True,
screen_scaling=9,
render_mode=None,
replay_path=None,
alphazero_mcts_ctree=mcts_ctree,
# ==============================================================
),
@@ -37,6 +37,7 @@
scale=True,
screen_scaling=9,
render_mode=None,
replay_path=None,
alphazero_mcts_ctree=mcts_ctree,
# ==============================================================
),
@@ -46,6 +46,7 @@
),
cuda=True,
env_type='board_games',
action_type='varied_action_space',
game_segment_length=int(board_size * board_size / 2), # for battle_mode='play_with_bot_mode'
update_per_collect=update_per_collect,
batch_size=batch_size,
@@ -46,6 +46,7 @@
),
cuda=True,
env_type='board_games',
action_type='varied_action_space',
game_segment_length=int(board_size * board_size / 2), # for battle_mode='play_with_bot_mode'
update_per_collect=update_per_collect,
batch_size=batch_size,
@@ -85,5 +86,4 @@

if __name__ == "__main__":
from lzero.entry import train_muzero

train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step)
@@ -45,6 +45,7 @@
),
cuda=True,
env_type='board_games',
action_type='varied_action_space',
game_segment_length=int(board_size * board_size), # for battle_mode='self_play_mode'
update_per_collect=update_per_collect,
batch_size=batch_size,
@@ -43,10 +43,11 @@
scale=True,
check_action_to_connect4_in_bot_v0=False,
simulation_env_name="gomoku",
# ==============================================================
mcts_ctree=mcts_ctree,
screen_scaling=9,
render_mode=None,
replay_path=None,
alphazero_mcts_ctree=mcts_ctree,
# ==============================================================
),
policy=dict(
# ==============================================================
@@ -39,10 +39,11 @@
scale=True,
check_action_to_connect4_in_bot_v0=False,
simulation_env_name="gomoku",
# ==============================================================
mcts_ctree=mcts_ctree,
screen_scaling=9,
render_mode=None,
replay_path=None,
alphazero_mcts_ctree=mcts_ctree,
# ==============================================================
),
policy=dict(
# ==============================================================