From 83bc5fb18c725590d0972d59d719d2184ba06535 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 15 Feb 2023 22:02:43 +0000 Subject: [PATCH 1/9] fix: fixed ray's error 'No module named aiohttp.signals' --- requirements.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.lock b/requirements.lock index 742f745f..4d7ba441 100644 --- a/requirements.lock +++ b/requirements.lock @@ -6,7 +6,7 @@ # absl-py==1.0.0 # via tensorboard -aiohttp==3.8.1 +aiohttp==3.7.4 # via # aiohttp-cors # ray @@ -16,7 +16,7 @@ aioredis==1.3.1 # via ray aiosignal==1.2.0 # via aiohttp -async-timeout==4.0.1 +async-timeout==3.0.1 # via # aiohttp # aioredis @@ -171,7 +171,7 @@ pytz==2021.3 # via pandas pyyaml==6.0 # via ray -ray==1.5.2 +ray==1.2 # via -r requirements.in redis==4.0.1 # via ray From 55582a8eda97c59a55bdf35ede27d41b62a5fd49 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Fri, 11 Aug 2023 04:33:17 +0100 Subject: [PATCH 2/9] merge representation network and dynamic network --- .gitignore | 4 +- models.py | 47 +- muzero.py | 4 + muzero_2net.py | 718 +++++++++++++++++ self_play.py | 127 ++- shared_storage.py | 10 +- simplifiedMuZero/RHEA.py | 12 + simplifiedMuZero/models2.py | 150 ++++ simplifiedMuZero/models_2net.py | 696 +++++++++++++++++ .../models_without_replay_buffer.py | 696 +++++++++++++++++ .../muzero_without_replay_buffer.py | 723 ++++++++++++++++++ simplifiedMuZero/replay_buffer3.py | 373 +++++++++ simplifiedMuZero/self_play_2net.py | 622 +++++++++++++++ .../self_play_without_replay_buffer.py | 624 +++++++++++++++ simplifiedMuZero/trainer_2net.py | 300 ++++++++ .../trainer_without_replay_buffer.py | 303 ++++++++ test/Simple_grid_test.py | 23 + test/ray_test.py | 20 + 18 files changed, 5388 insertions(+), 64 deletions(-) create mode 100644 muzero_2net.py create mode 100644 simplifiedMuZero/RHEA.py create mode 100644 simplifiedMuZero/models2.py create mode 100644 simplifiedMuZero/models_2net.py create mode 100644 simplifiedMuZero/models_without_replay_buffer.py create mode 100644 simplifiedMuZero/muzero_without_replay_buffer.py create mode 100644 simplifiedMuZero/replay_buffer3.py create mode 100644 simplifiedMuZero/self_play_2net.py create mode 100644 simplifiedMuZero/self_play_without_replay_buffer.py create mode 100644 simplifiedMuZero/trainer_2net.py create mode 100644 simplifiedMuZero/trainer_without_replay_buffer.py create mode 100644 test/Simple_grid_test.py create mode 100644 test/ray_test.py diff --git a/.gitignore b/.gitignore index f106bb6b..844f676b 100644 --- a/.gitignore +++ b/.gitignore @@ -90,4 +90,6 @@ venv.bak/ # mypy .mypy_cache/ .dmypy.json -dmypy.json \ No newline at end of file +dmypy.json + +results/ \ No newline at end of file diff --git a/models.py b/models.py index be847fef..d4b8bc2f 100644 --- a/models.py +++ b/models.py @@ -94,6 +94,7 @@ def __init__( super().__init__() self.action_space_size = action_space_size self.full_support_size = 2 * support_size + 1 + # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 self.representation_network = torch.nn.DataParallel( mlp( @@ -107,6 +108,7 @@ def __init__( ) ) + #dynamics的输入是encoding_size+action_space_size self.dynamics_encoded_state_network = torch.nn.DataParallel( mlp( encoding_size + self.action_space_size, @@ -115,14 +117,14 @@ def __init__( ) ) self.dynamics_reward_network = torch.nn.DataParallel( - mlp(encoding_size, fc_reward_layers, self.full_support_size) + mlp(encoding_size, fc_reward_layers, self.full_support_size) 
#最后的输出为full_support_size,因为范围是[-support_size, support_size] ) self.prediction_policy_network = torch.nn.DataParallel( - mlp(encoding_size, fc_policy_layers, self.action_space_size) + mlp(encoding_size, fc_policy_layers, self.action_space_size) #输出action的概率 ) self.prediction_value_network = torch.nn.DataParallel( - mlp(encoding_size, fc_value_layers, self.full_support_size) + mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) def prediction(self, encoded_state): @@ -134,16 +136,19 @@ def representation(self, observation): encoded_state = self.representation_network( observation.view(observation.shape[0], -1) ) + + # 正则化 # Scale encoded state between [0, 1] (See appendix paper Training) min_encoded_state = encoded_state.min(1, keepdim=True)[0] max_encoded_state = encoded_state.max(1, keepdim=True)[0] scale_encoded_state = max_encoded_state - min_encoded_state - scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 + scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN encoded_state_normalized = ( encoded_state - min_encoded_state ) / scale_encoded_state return encoded_state_normalized + # dynamic同representation的唯一不同就是前者需要将encoded_state和action合并在一起作为输入,而representation不需要绑定action def dynamics(self, encoded_state, action): # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) action_one_hot = ( @@ -151,18 +156,19 @@ def dynamics(self, encoded_state, action): .to(action.device) .float() ) - action_one_hot.scatter_(1, action.long(), 1.0) + action_one_hot.scatter_(1, action.long(), 1.0) #将action的位置赋值为1 x = torch.cat((encoded_state, action_one_hot), dim=1) next_encoded_state = self.dynamics_encoded_state_network(x) reward = self.dynamics_reward_network(next_encoded_state) + # 正则化 # Scale encoded state between [0, 1] (See paper appendix Training) min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0] max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0] scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state - scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 + scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN next_encoded_state_normalized = ( next_encoded_state - min_next_encoded_state ) / scale_next_encoded_state @@ -172,7 +178,7 @@ def dynamics(self, encoded_state, action): def initial_inference(self, observation): encoded_state = self.representation(observation) policy_logits, value = self.prediction(encoded_state) - # reward equal to 0 for consistency + # reward equal to 0 for consistency 一致性奖励等于 0 reward = torch.log( ( torch.zeros(1, self.full_support_size) @@ -181,6 +187,7 @@ def initial_inference(self, observation): .to(observation.device) ) ) + # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 return ( value, @@ -605,8 +612,8 @@ def initial_inference(self, observation): reward = torch.log( ( torch.zeros(1, self.full_support_size) - .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) - .repeat(len(observation), 1) + .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # 将support_size位置设为1 + .repeat(len(observation), 1) # 根据observation的长度复制,保证reward的维度于observation的一致,即之前的observation也赋值 .to(observation.device) ) ) @@ -637,29 +644,29 @@ def mlp( sizes = [input_size] + layer_sizes + [output_size] layers = [] for i in range(len(sizes) - 1): - act = activation if i < len(sizes) - 2 else 
output_activation + act = activation if i < len(sizes) - 2 else output_activation #激活函数,最后一层是output_activation,其余的都一样 layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()] return torch.nn.Sequential(*layers) -def support_to_scalar(logits, support_size): +def support_to_scalar(logits, support_size): # logits 是 value的对数值,support_size是转换后的范围。 """ Transform a categorical representation to a scalar See paper appendix Network Architecture """ # Decode to a scalar - probabilities = torch.softmax(logits, dim=1) + probabilities = torch.softmax(logits, dim=1) # softmax在指定的向量和为1,softmax扩大大的,缩小下的,shape为[stacked_size, fully_support_size] support = ( - torch.tensor([x for x in range(-support_size, support_size + 1)]) + torch.tensor([x for x in range(-support_size, support_size + 1)]) # 范围是-support_size, support_szie。因为support_size+1 .expand(probabilities.shape) .float() .to(device=probabilities.device) - ) - x = torch.sum(support * probabilities, dim=1, keepdim=True) + ) # shape 为【stacked_size, fully_support_size】, + x = torch.sum(support * probabilities, dim=1, keepdim=True) # 输出为【1,fully_support_size】,因为dim=1,另外keep_dim=True,所有是【1,fully_support_size】而不是【fully_support_size] # Invert the scaling (defined in https://arxiv.org/abs/1805.11593) - x = torch.sign(x) * ( - ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) + x = torch.sign(x) * ( # sign函数为分段函数, x小于0为-1,大于0为1,0为0。主要是获取x的符号 + ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) # (sqrt(1+0.04*(|x|+1.001))-1)/0.002 ** 2 - 1 ) @@ -675,9 +682,9 @@ def scalar_to_support(x, support_size): x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x # Encode on a vector - x = torch.clamp(x, -support_size, support_size) - floor = x.floor() - prob = x - floor + x = torch.clamp(x, -support_size, support_size) # 裁剪x的范围,使x的范围定为[-support_size, support_size] + floor = x.floor() # floor向下取整,类似的,ceil为向上取整 + prob = x - floor # 减去整数,保留小数部分(因为在support_to_scala部分是index位置乘上概率) logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device) logits.scatter_( 2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1) diff --git a/muzero.py b/muzero.py index f7601c9b..3e075e96 100644 --- a/muzero.py +++ b/muzero.py @@ -43,6 +43,7 @@ def __init__(self, game_name, config=None, split_resources_in=1): # Load the game and the config from the module with the game name try: game_module = importlib.import_module("games." + game_name) + print("games." 
+ game_name) self.Game = game_module.Game self.config = game_module.MuZeroConfig() except ModuleNotFoundError as err: @@ -671,7 +672,10 @@ def load_model_menu(muzero, game_name): choice = input("Invalid input, enter a number listed above: ") choice = int(choice) if choice == 0: + start_time = time.time() muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) elif choice == 1: load_model_menu(muzero, game_name) elif choice == 2: diff --git a/muzero_2net.py b/muzero_2net.py new file mode 100644 index 00000000..bfdc38b0 --- /dev/null +++ b/muzero_2net.py @@ -0,0 +1,718 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +sys.path.append("") + +import diagnose_model +import simplifiedMuZero.models_2net as models +import simplifiedMuZero.replay_buffer3 as replay_buffer +import simplifiedMuZero.self_play_2net as self_play +import shared_storage +import simplifiedMuZero.trainer_2net as trainer + + +class MuZero: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. + """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if 
self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. + """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.SimplifiedMuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/self_play.py b/self_play.py index d90fe5db..d09c5e87 100644 --- a/self_play.py +++ b/self_play.py @@ -33,8 +33,8 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): shared_storage.get_info.remote("training_step") ) < self.config.training_steps and not ray.get( shared_storage.get_info.remote("terminate") - ): - self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + 
self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 if not test_mode: game_history = self.play_game( @@ -107,6 +107,16 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): self.close_game() + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory def play_game( self, temperature, temperature_threshold, render, opponent, muzero_player ): @@ -116,7 +126,7 @@ def play_game( game_history = GameHistory() observation = self.game.reset() game_history.action_history.append(0) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) game_history.to_play_history.append(self.game.to_play()) @@ -128,7 +138,7 @@ def play_game( with torch.no_grad(): while ( not done and len(game_history.action_history) <= self.config.max_moves - ): + ): # 游戏没有结束且运行步数小于最大移动步长 assert ( len(numpy.array(observation).shape) == 3 ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" @@ -138,14 +148,17 @@ def play_game( stacked_observations = game_history.get_stacked_observations( -1, self.config.stacked_observations, len(self.config.action_space) ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + # 一下的if-else部分主要是为了选择一个动作 # Choose the action if opponent == "self" or muzero_player == self.game.to_play(): root, mcts_info = MCTS(self.config).run( self.model, stacked_observations, self.game.legal_actions(), - self.game.to_play(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 True, ) action = self.select_action( @@ -154,7 +167,7 @@ def play_game( if not temperature_threshold or len(game_history.action_history) < temperature_threshold else 0, - ) + ) # 根据temperature选择动作 if render: print(f'Tree depth: {mcts_info["max_tree_depth"]}') @@ -162,11 +175,11 @@ def play_game( f"Root value for player {self.game.to_play()}: {root.value():.2f}" ) else: - action, root = self.select_opponent_action( + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 opponent, stacked_observations ) - observation, reward, done = self.game.step(action) + observation, reward, done = self.game.step(action) # 运行游戏 if render: print(f"Played action: {self.game.action_to_string(action)}") @@ -176,7 +189,7 @@ def play_game( # Next batch game_history.action_history.append(action) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 game_history.reward_history.append(reward) game_history.to_play_history.append(self.game.to_play()) @@ -219,7 +232,12 @@ def select_opponent_action(self, opponent, stacked_observations): 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' ) - @staticmethod + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod 
# static-method decorator, similar to the `static` keyword in other languages
     def select_action(node, temperature):
         """
         Select action according to the visit count distribution and the temperature.
@@ -257,6 +275,25 @@ class MCTS:
     def __init__(self, config):
         self.config = config

+    # How run() works:
+    # 1. Obtain the root node:
+    #    (1) if a node is passed in (override_root_with), use it as the root;
+    #    (2) otherwise:
+    #        i.   create a new Node(0)
+    #        ii.  call initial_inference on the observation to obtain the reward, hidden state, legal actions, etc.
+    #        iii. copy the data from step ii into the newly created root node
+    #    NB: in case (1) initial_inference does not need to be called at all.
+    # 2. Check whether exploration noise should be added to the root.
+    # 3. Simulate games in a loop; the number of iterations is given by num_simulations:
+    #    (1) set the current node to the root and append it to the search path
+    #    (2) if the node is already expanded, select a child by UCB score, make it the current node and append it to the search path
+    #    (3) repeat (2) until a node with expanded() == False is reached
+    #    (4) take search_path[-2] as the parent (the last entry is the node itself)
+    #    (5) call recurrent_inference to obtain the reward, hidden state, policy logits, etc.
+    #    (6) expand the node, i.e. create its children
+    #    (7) backpropagate: increment the visit counts and accumulate the discounted values along the path
+    #    NB: each simulation expands exactly one node, so the tree grows one node at a time.
+    # 4. Return the expanded root so the caller can select an action from it.
     def run(
         self,
         model,
@@ -272,7 +309,7 @@ def run(
         We then run a Monte Carlo Tree Search using only action sequences and the model
         learned by the network.
         """
-        if override_root_with:
+        if override_root_with: # if a root node is provided, use it; otherwise create one
             root = override_root_with
             root_predicted_value = None
         else:
@@ -282,7 +319,7 @@ def run(
                 .float()
                 .unsqueeze(0)
                 .to(next(model.parameters()).device)
-            )
+            ) # convert the observation to a tensor and add a batch dimension; the history length is set by stacked_observations (use 0 to keep no previous observations)
             (
                 root_predicted_value,
                 reward,
@@ -316,16 +353,17 @@ def run(

         min_max_stats = MinMaxStats()

         max_tree_depth = 0
-        for _ in range(self.config.num_simulations):
+        for _ in range(self.config.num_simulations): # run the simulations
            virtual_to_play = to_play
            node = root
            search_path = [node]
            current_tree_depth = 0

-            while node.expanded():
+            # expanded() checks whether the node has children; no children means it has not been expanded yet
+            while node.expanded(): # descend until an unexpanded node is found; expanded nodes are traversed via select_child
                current_tree_depth += 1
-                action, node = self.select_child(node, min_max_stats)
-                search_path.append(node)
+                action, node = self.select_child(node, min_max_stats) # pick the action with the highest UCB score (ties broken at random)
+                search_path.append(node) # append the node to the search path

                # Players play turn by turn
                if virtual_to_play + 1 < len(self.config.players):
@@ -333,15 +371,18 @@ def run(
                else:
                    virtual_to_play = self.config.players[0]

+            # (inside the tree the learned dynamics network stands in for the real environment step)
            # Inside the search tree we use the dynamics function to obtain the next hidden
            # state given an action and the previous hidden state
-            parent = search_path[-2]
+            parent = search_path[-2] # the current node is at index -1, so index -2 is its parent
            value, reward, policy_logits, hidden_state = model.recurrent_inference(
                parent.hidden_state,
                torch.tensor([[action]]).to(parent.hidden_state.device),
            )
            value = models.support_to_scalar(value, self.config.support_size).item()
            reward = models.support_to_scalar(reward, self.config.support_size).item()
+            # expand one level below this node: actions is the list of legal actions,
+            # policy_logits supplies the prior for each of those actions
            node.expand(
                self.config.action_space,
                virtual_to_play,
@@ -360,6 +401,9 @@ def run(
        }
        return root, extra_info

+    # MCTS.select_child and SelfPlay.select_action use different logic:
+    # 1. select_child picks by UCB score, while select_action samples from the visit counts and the temperature.
+    # 2. select_child returns a Node (the child reached by taking an action from the current state); select_action returns only an action.
    def select_child(self, node, min_max_stats):
        """
        Select the child with the highest UCB score.
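The temperature rule described in the comments above select_action (probabilities proportional to visit_count^(1/T), argmax at T=0, uniform as T→+inf) can be checked in isolation. A minimal sketch, not the repository's code; actions and visit_counts are hypothetical stand-ins for the statistics stored on the root's children:

import numpy

def select_action_from_counts(actions, visit_counts, temperature):
    """Pick an action from visit counts c_i with probability proportional to c_i**(1/T)."""
    visit_counts = numpy.array(visit_counts, dtype="float64")
    if temperature == 0:
        # T = 0: greedy, take the most visited action
        return actions[numpy.argmax(visit_counts)]
    if temperature == float("inf"):
        # T = +inf: every count collapses to 1, i.e. a uniform random choice
        return numpy.random.choice(actions)
    distribution = visit_counts ** (1 / temperature)
    distribution /= distribution.sum()
    return numpy.random.choice(actions, p=distribution)

# Example: counts [10, 5, 1] favour action 0 strongly at T=0.5, only mildly at T=2
print(select_action_from_counts([0, 1, 2], [10, 5, 1], 0.5))
print(select_action_from_counts([0, 1, 2], [10, 5, 1], 2.0))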
@@ -368,7 +412,7 @@ def select_child(self, node, min_max_stats): self.ucb_score(node, child, min_max_stats) for action, child in node.children.items() ) - action = numpy.random.choice( + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) [ action for action, child in node.children.items() @@ -377,33 +421,37 @@ def select_child(self, node, min_max_stats): ) return action, node.children[action] - def ucb_score(self, parent, child, min_max_stats): + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 """ The score for a node is based on its value, plus an exploration bonus based on the prior. """ pb_c = ( math.log( - (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 ) + self.config.pb_c_init ) pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) - prior_score = pb_c * child.prior + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior if child.visit_count > 0: # Mean value Q - value_score = min_max_stats.normalize( + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 child.reward - + self.config.discount - * (child.value() if len(self.config.players) == 1 else -child.value()) + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 ) else: value_score = 0 - return prior_score + value_score + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 - def backpropagate(self, search_path, value, to_play, min_max_stats): + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 """ At the end of a simulation, we propagate the evaluation all the way up the tree to the root. @@ -432,7 +480,7 @@ def backpropagate(self, search_path, value, to_play, min_max_stats): class Node: def __init__(self, prior): - self.visit_count = 0 + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 self.to_play = -1 self.prior = prior self.value_sum = 0 @@ -449,6 +497,8 @@ def value(self): return self.value_sum / self.visit_count def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 """ We expand a node using the value, reward and policy prediction obtained from the neural network. @@ -460,7 +510,7 @@ def expand(self, actions, to_play, reward, policy_logits, hidden_state): policy_values = torch.softmax( torch.tensor([policy_logits[0][a] for a in actions]), dim=0 ).tolist() - policy = {a: policy_values[i] for i, a in enumerate(actions)} + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 for action, p in policy.items(): self.children[action] = Node(p) @@ -512,7 +562,7 @@ def store_search_statistics(self, root, action_space): def get_stacked_observations( self, index, num_stacked_observations, action_space_size - ): + ): #根据索引index获取observation序列 """ Generate a new observation with the observation at the index position and num_stacked_observations past observations and actions stacked. 
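A simplified sketch of what get_stacked_observations is doing: the observation at the given index is followed by num_stacked past observations, each paired with a plane encoding the action taken from it, and positions before the start of the game are padded with zeros. This is illustrative only, assuming image-like observations of shape (channels, height, width), and omits the repository's exact code:

import numpy

def stack_observations(observation_history, action_history, index, num_stacked):
    """Concatenate the observation at `index` with `num_stacked` past (observation, action-plane) pairs."""
    index = index % len(observation_history)  # support negative indices, as in GameHistory
    stacked = observation_history[index].copy()
    for past in reversed(range(index - num_stacked, index)):
        if past >= 0:
            # one extra plane filled with the action taken after observation `past`
            action_plane = numpy.ones_like(observation_history[past][:1]) * action_history[past + 1]
            previous = numpy.concatenate((observation_history[past], action_plane))
        else:
            # before the start of the game: zero padding keeps the stacked shape fixed
            previous = numpy.concatenate(
                (numpy.zeros_like(observation_history[index]),
                 numpy.zeros_like(observation_history[index][:1]))
            )
        stacked = numpy.concatenate((stacked, previous))
    return stacked

# Three 1x2x2 observations, two past frames stacked behind the latest one -> shape (5, 2, 2)
history = [numpy.full((1, 2, 2), i, dtype=float) for i in range(3)]
actions = [0, 1, 2]  # actions[i] is the action that produced observation i
print(stack_observations(history, actions, index=-1, num_stacked=2).shape)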
@@ -520,12 +570,12 @@ def get_stacked_observations( # Convert to positive index index = index % len(self.observation_history) - stacked_observations = self.observation_history[index].copy() + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 for past_observation_index in reversed( range(index - num_stacked_observations, index) ): if 0 <= past_observation_index: - previous_observation = numpy.concatenate( + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 ( self.observation_history[past_observation_index], [ @@ -543,7 +593,7 @@ def get_stacked_observations( ) ) - stacked_observations = numpy.concatenate( + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 (stacked_observations, previous_observation) ) @@ -556,15 +606,16 @@ class MinMaxStats: """ def __init__(self): - self.maximum = -float("inf") - self.minimum = float("inf") + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: + def normalize(self, value): #对value规范化,公式为(x-a)/(a-b) 当x∈[a,b]时 + if self.maximum > self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. + """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, + + # Launch workers + # 此处调用worker进行self play,把结果存在replay_buffer里 + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + + # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + # 
使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. + """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/simplifiedMuZero/replay_buffer3.py b/simplifiedMuZero/replay_buffer3.py new file mode 100644 index 00000000..762d5a0e --- /dev/null +++ b/simplifiedMuZero/replay_buffer3.py @@ -0,0 +1,373 @@ +import copy +import time + +import numpy +import ray +import torch + +import simplifiedMuZero.models_2net as models + + +@ray.remote +class ReplayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. 
+ """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history, shared_storage=None): + if self.config.PER: + if game_history.priorities is not None: + # Avoid read only array when loading replay buffer from disk + game_history.priorities = numpy.copy(game_history.priorities) + else: + # Initial priorities for the prioritized replay (See paper appendix Training) + priorities = [] + for i, root_value in enumerate(game_history.root_values): + priority = ( + numpy.abs( + root_value - self.compute_target_value(game_history, i) + ) + ** self.config.PER_alpha + ) + priorities.append(priority) + + game_history.priorities = numpy.array(priorities, dtype="float32") + game_history.game_priority = numpy.max(game_history.priorities) + + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + if shared_storage: + shared_storage.set_info.remote("num_played_games", self.num_played_games) + shared_storage.set_info.remote("num_played_steps", self.num_played_steps) + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = [] if self.config.PER else None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + if self.config.PER: + weight_batch.append(1 / (self.total_samples * game_prob * pos_prob)) + + if self.config.PER: + weight_batch = numpy.array(weight_batch, dtype="float32") / max( + weight_batch + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) 
+ + def sample_game(self, force_uniform=False): + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. + """ + game_prob = None + if self.config.PER and not force_uniform: + game_probs = numpy.array( + [game_history.game_priority for game_history in self.buffer.values()], + dtype="float32", + ) + game_probs /= numpy.sum(game_probs) + game_index = numpy.random.choice(len(self.buffer), p=game_probs) + game_prob = game_probs[game_index] + else: + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games, force_uniform=False): + if self.config.PER and not force_uniform: + game_id_list = [] + game_probs = [] + for game_id, game_history in self.buffer.items(): + game_id_list.append(game_id) + game_probs.append(game_history.game_priority) + game_probs = numpy.array(game_probs, dtype="float32") + game_probs /= numpy.sum(game_probs) + game_prob_dict = dict( + [(game_id, prob) for game_id, prob in zip(game_id_list, game_probs)] + ) + selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) + else: + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history, force_uniform=False): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + if self.config.PER and not force_uniform: + position_probs = game_history.priorities / sum(game_history.priorities) + position_index = numpy.random.choice(len(position_probs), p=position_probs) + position_prob = position_probs[position_index] + else: + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + if next(iter(self.buffer)) <= game_id: + if self.config.PER: + # Avoid read only array when loading replay buffer from disk + game_history.priorities = numpy.copy(game_history.priorities) + self.buffer[game_id] = game_history + + def update_priorities(self, priorities, index_info): + """ + Update game and position priorities with priorities calculated during the training. + See Distributed Prioritized Experience Replay https://arxiv.org/abs/1803.00933 + """ + for i in range(len(index_info)): + game_id, game_pos = index_info[i] + + # The element could have been removed since its selection and training + if next(iter(self.buffer)) <= game_id: + # Update position priorities + priority = priorities[i, :] + start_index = game_pos + end_index = min( + game_pos + len(priority), len(self.buffer[game_id].priorities) + ) + self.buffer[game_id].priorities[start_index:end_index] = priority[ + : end_index - start_index + ] + + # Update game priorities + self.buffer[game_id].game_priority = numpy.max( + self.buffer[game_id].priorities + ) + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. 
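+        # In symbols (illustrative, with n = td_steps and d = discount):
+        #   value(t) = sum_{i=0..n-1} d^i * r_{t+1+i}  +  d^n * v(t+n)
+        # where v(t+n) is the (possibly reanalysed) root value at the bootstrap index,
+        # each term is sign-flipped when that position belongs to the other player,
+        # and the bootstrap term is 0 if t+n falls past the end of the game.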
+ bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. + """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions + + +@ray.remote +class Reanalyse: + """ + Class which run in a dedicated thread to update the replay buffer with fresh information. + See paper appendix Reanalyse. 
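+    A game is sampled uniformly from the replay buffer, its observations are re-evaluated
+    with the latest network (initial_inference), and the resulting values overwrite
+    game_history.reanalysed_predicted_root_values so that value targets stay fresh.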
+ """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu")) + self.model.eval() + + self.num_reanalysed_games = initial_checkpoint["num_reanalysed_games"] + + def reanalyse(self, replay_buffer, shared_storage): + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) + + game_id, game_history, _ = ray.get( + replay_buffer.sample_game.remote(force_uniform=True) + ) + + # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + if self.config.use_last_model_value: + observations = numpy.array( + [ + game_history.get_stacked_observations( + i, + self.config.stacked_observations, + len(self.config.action_space), + ) + for i in range(len(game_history.root_values)) + ] + ) + + observations = ( + torch.tensor(observations) + .float() + .to(next(self.model.parameters()).device) + ) + values = models.support_to_scalar( + self.model.initial_inference(observations)[0], + self.config.support_size, + ) + game_history.reanalysed_predicted_root_values = ( + torch.squeeze(values).detach().cpu().numpy() + ) + + replay_buffer.update_game_history.remote(game_id, game_history) + self.num_reanalysed_games += 1 + shared_storage.set_info.remote( + "num_reanalysed_games", self.num_reanalysed_games + ) diff --git a/simplifiedMuZero/self_play_2net.py b/simplifiedMuZero/self_play_2net.py new file mode 100644 index 00000000..af2a2e39 --- /dev/null +++ b/simplifiedMuZero/self_play_2net.py @@ -0,0 +1,622 @@ +import math +import time + +import numpy +import ray +import torch + +import simplifiedMuZero.models_2net as models + + +@ray.remote +class SelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. 
+ """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.SimplifiedMuZeroNetwork(self.config) + # self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. 
+ """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
+ + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class MCTS: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." 
+ assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. 
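+
+        Concretely (matching the code below):
+            pb_c = log((N_parent + pb_c_base + 1) / pb_c_base) + pb_c_init
+            pb_c *= sqrt(N_parent) / (N_child + 1)
+            score = pb_c * prior + normalized(Q)
+        where Q = reward + discount * child value (negated for two-player games) and the
+        value term is 0 for unvisited children.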
+ """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. + """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. + """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. 
+ """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class MCTS: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 
获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
+ root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. 
+ """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. + """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. + """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. 
+ """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/trainer_without_replay_buffer.py b/simplifiedMuZero/trainer_without_replay_buffer.py new file mode 100644 index 00000000..48236e0f --- /dev/null +++ b/simplifiedMuZero/trainer_without_replay_buffer.py @@ -0,0 +1,303 @@ +import copy +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." 
+ ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + # update weights 与 continuous update weights 的区别 + # 1. update weights 是实际计算更新network的权重 + # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/test/Simple_grid_test.py b/test/Simple_grid_test.py new file mode 100644 index 00000000..501ac2df --- /dev/null +++ b/test/Simple_grid_test.py @@ -0,0 +1,23 @@ +import numpy as np + +from games.simple_grid import Game +import random +import time + +g = Game() +observation = g.env.get_observation() + +# print(observer) +for i in range(1000): + actions = g.legal_actions() + observation, reward, done = g.step(random.choice(actions)) + # g.render() + print(np.array(observation).shape) + + if done: + break + + + # time.sleep(10) + +g.close() diff --git a/test/ray_test.py b/test/ray_test.py new file mode 100644 index 00000000..7d7f0cf6 --- /dev/null +++ b/test/ray_test.py @@ -0,0 +1,20 @@ +import ray +import time + +ray.init() + +@ray.remote +def hello(): + return "Hello world!" 
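[Editor's aside] test/ray_test.py above only exercises Ray's task API (@ray.remote on a function, hello.remote(), ray.get, ray.put). The training pipeline additionally relies on Ray *actors*: the self-play, trainer and replay-buffer classes in this repo are decorated with @ray.remote and their methods are called with .remote(). A minimal, self-contained sketch of that actor pattern (the Counter class is illustrative only, not part of the repository):

import ray

@ray.remote
class Counter:
    # A stateful worker living in its own process, analogous to the
    # SelfPlay / Trainer / ReplayBuffer actors used by the training loop.
    def __init__(self):
        self.n = 0

    def increment(self):
        self.n += 1
        return self.n

ray.init(ignore_reinit_error=True)
counter = Counter.remote()                                 # spawn the actor process
futures = [counter.increment.remote() for _ in range(3)]   # async calls return ObjectRefs
print(ray.get(futures))                                    # -> [1, 2, 3]
ray.shutdown()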
+ +object_id = hello.remote() + +hello = ray.get(object_id) + +print(hello) + +# time.sleep(100) +results_ids = [ray.put(i) for i in range(10)] +print(ray.get(results_ids)) + +ray.shutdown() \ No newline at end of file From d5fc4874b8196616e033f547d7529ad1d47792b0 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Mon, 14 Aug 2023 08:28:37 +0100 Subject: [PATCH 3/9] remove replay buffer --- MuZero_No_Replay_Buffer.py | 1260 +++++++++++++++++ muzero_2net.py | 8 +- ...ffer.py => muzero_without_replay_buffer.py | 26 +- replay_buffer.py | 29 +- self_play.py | 2 +- simplifiedMuZero/__init__.py | 0 simplifiedMuZero/{ => net2}/models_2net.py | 25 +- .../replay_buffer_2net.py} | 4 +- simplifiedMuZero/{ => net2}/self_play_2net.py | 2 +- simplifiedMuZero/{ => net2}/trainer_2net.py | 4 +- simplifiedMuZero/{ => search_policy}/RHEA.py | 0 simplifiedMuZero/search_policy/__init__.py | 0 .../models_without_replay_buffer.py | 0 .../self_play_without_replay_buffer.py | 2 +- .../trainer_without_replay_buffer.py | 6 +- test/game_play_test.py | 696 +++++++++ trainer.py | 4 +- 17 files changed, 2020 insertions(+), 48 deletions(-) create mode 100644 MuZero_No_Replay_Buffer.py rename simplifiedMuZero/muzero_without_replay_buffer.py => muzero_without_replay_buffer.py (96%) create mode 100644 simplifiedMuZero/__init__.py rename simplifiedMuZero/{ => net2}/models_2net.py (98%) rename simplifiedMuZero/{replay_buffer3.py => net2/replay_buffer_2net.py} (98%) rename simplifiedMuZero/{ => net2}/self_play_2net.py (99%) rename simplifiedMuZero/{ => net2}/trainer_2net.py (98%) rename simplifiedMuZero/{ => search_policy}/RHEA.py (100%) create mode 100644 simplifiedMuZero/search_policy/__init__.py rename simplifiedMuZero/{ => without_rb}/models_without_replay_buffer.py (100%) rename simplifiedMuZero/{ => without_rb}/self_play_without_replay_buffer.py (99%) rename simplifiedMuZero/{ => without_rb}/trainer_without_replay_buffer.py (97%) create mode 100644 test/game_play_test.py diff --git a/MuZero_No_Replay_Buffer.py b/MuZero_No_Replay_Buffer.py new file mode 100644 index 00000000..bf280c71 --- /dev/null +++ b/MuZero_No_Replay_Buffer.py @@ -0,0 +1,1260 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +# import simplifiedMuZero.without_rb.models_without_replay_buffer as models +import models +# import replay_buffer +# import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play +import shared_storage +# import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer +from self_play import MCTS, GameHistory +from muzero import load_model_menu, CPUActor + +# training_step是一个全局变量,用来存储现有的运行次数,不要超过游戏config里的training_steps,如30000次 + +class GamePlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. 
+ """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + def continuous_self_play(self, test_mode=False): + # def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while self.trained_steps < self.config.training_steps and not self.terminate: # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + # 此处不要用set——weights,因为现在移除了replay_buffer,不需要shared_storage了 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + # game_history = self.play_game( + # self.config.visit_softmax_temperature_fn( + # trained_steps=ray.get( + # shared_storage.get_info.remote("training_step") + # ) + # ), + # self.config.temperature_threshold, + # False, + # "self", + # 0, + # ) + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + self.trained_steps + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + # replay_buffer.save_game.remote(game_history, shared_storage) + return game_history + + else: + # Take the best action (no exploration) in test mode # 在测试模式下采取最佳行动(无探索) + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + # play game 与continuous self play 的区别: + # 1. play game 是实际运行游戏,游戏的结果存在game history里,不向replay buffer里写 + # 2. continuous self play 调用play game,把获取到的game history 异步写进 replay buffer + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 
返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. 
Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + +class Trainer_without_Replay_Buffer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + # update weights 与 continuous update weights 的区别 + # 1. update weights 是实际计算更新network的权重 + # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + # 此处才算一次迭代完成,training step加1 + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss + +class MuZero_No_Replay_Buffer: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_No_Replay_Buffer("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
+ ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + # cpu_actor = CPUActor.remote() + # cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + # 移除ray + cpu_actor = CPUActor() + cpu_weights = cpu_actor.get_initial_weights(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + # self.training_worker = trainer.Trainer.options( + # num_cpus=0, + # num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + # ).remote(self.checkpoint, self.config) + # + # self.shared_storage_worker = shared_storage.SharedStorage.remote( + # self.checkpoint, + # self.config, + # ) + # self.shared_storage_worker.set_info.remote("terminate", False) + # + # self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + # self.checkpoint, self.replay_buffer, self.config + # ) + + # 初始化权重 + self.training_worker = Trainer_without_Replay_Buffer(self.checkpoint, self.config) + + # #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + # if self.config.use_last_model_value: + # self.reanalyse_worker = replay_buffer.Reanalyse.options( + # num_cpus=0, + # num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + # ).remote(self.checkpoint, self.config) + # + # self.self_play_workers = [ + # self_play.SelfPlay.options( + # num_cpus=0, + # num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + # ).remote( + # self.checkpoint, + # self.Game, + # self.config, + # self.config.seed + seed, + # ) + # for seed in range(self.config.num_workers) + # ] + # + # # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, + # + # # Launch workers + # # 此处调用worker进行self play,把结果存在replay_buffer里 + # [ + # self_play_worker.continuous_self_play.remote( + # self.shared_storage_worker, self.replay_buffer_worker + # ) + # for self_play_worker in self.self_play_workers + # ] + + # # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 + # self.training_worker.continuous_update_weights.remote( + # self.replay_buffer_worker, self.shared_storage_worker + # ) + self.training_worker.continuous_update_weights(self.replay_buffer_worker, self.shared_storage_worker) + + # # 使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + # if self.config.use_last_model_value: + # self.reanalyse_worker.reanalyse.remote( + # self.replay_buffer_worker, self.shared_storage_worker + # ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + # 此处是将replay buffer的结果写入文件保持 + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + # pickle用来存储和导入文件,其作用是将对象转换为字符串或者将字符串转换为对象 + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + # 此处更新replay buffer的值 + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +# @ray.remote(num_cpus=0, num_gpus=0) +# class CPUActor: +# # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU +# def __init__(self): +# pass +# +# def get_initial_weights(self, config): +# model = models.MuZeroNetwork(config) +# weigths = model.get_weights() +# summary = str(model).replace("\n", " \n\n") +# return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_No_Replay_Buffer): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_No_Replay_Buffer(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_No_Replay_Buffer(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_No_Replay_Buffer(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. 
{options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_No_Replay_Buffer(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_2net.py b/muzero_2net.py index bfdc38b0..d03457ec 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -16,11 +16,11 @@ sys.path.append("") import diagnose_model -import simplifiedMuZero.models_2net as models -import simplifiedMuZero.replay_buffer3 as replay_buffer -import simplifiedMuZero.self_play_2net as self_play +import simplifiedMuZero.net2.models_2net as models +import simplifiedMuZero.net2.replay_buffer_2net as replay_buffer +import simplifiedMuZero.net2.self_play_2net as self_play import shared_storage -import simplifiedMuZero.trainer_2net as trainer +import simplifiedMuZero.net2.trainer_2net as trainer class MuZero: diff --git a/simplifiedMuZero/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py similarity index 96% rename from simplifiedMuZero/muzero_without_replay_buffer.py rename to muzero_without_replay_buffer.py index 37436e79..e0a63690 100644 --- a/simplifiedMuZero/muzero_without_replay_buffer.py +++ b/muzero_without_replay_buffer.py @@ -14,14 +14,14 @@ from torch.utils.tensorboard import SummaryWriter import diagnose_model -import models -import replay_buffer -import self_play +import simplifiedMuZero.without_rb.models_without_replay_buffer as models +# import replay_buffer +import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play import shared_storage -import trainer +import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer -class MuZero: +class MuZero_Without_Replay_Buffer: """ Main class to manage MuZero. @@ -34,7 +34,7 @@ class MuZero: split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. 
Example: - >>> muzero = MuZero("cartpole") + >>> muzero = MuZero_Without_Replay_Buffer("cartpole") >>> muzero.train() >>> muzero.test(render=True) """ @@ -530,7 +530,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments.append(muzero) @@ -556,7 +556,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments[i] = muzero @@ -566,7 +566,7 @@ def hyperparameter_search( except KeyboardInterrupt: for experiment in running_experiments: - if isinstance(experiment, MuZero): + if isinstance(experiment, MuZero_Without_Replay_Buffer): experiment.terminate_workers() recommendation = optimizer.provide_recommendation() @@ -630,12 +630,12 @@ def load_model_menu(muzero, game_name): if __name__ == "__main__": if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole - muzero = MuZero(sys.argv[1]) + muzero = MuZero_Without_Replay_Buffer(sys.argv[1]) muzero.train() elif len(sys.argv) == 3: # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' config = json.loads(sys.argv[2]) - muzero = MuZero(sys.argv[1], config) + muzero = MuZero_Without_Replay_Buffer(sys.argv[1], config) muzero.train() else: print("\nWelcome to MuZero! Here's a list of games:") @@ -655,7 +655,7 @@ def load_model_menu(muzero, game_name): # Initialize MuZero choice = int(choice) game_name = games[choice] - muzero = MuZero(game_name) + muzero = MuZero_Without_Replay_Buffer(game_name) while True: # Configure running options @@ -715,7 +715,7 @@ def load_model_menu(muzero, game_name): best_hyperparameters = hyperparameter_search( game_name, parametrization, budget, parallel_experiments, 20 ) - muzero = MuZero(game_name, best_hyperparameters) + muzero = MuZero_Without_Replay_Buffer(game_name, best_hyperparameters) else: break print("\nDone") diff --git a/replay_buffer.py b/replay_buffer.py index 81bc813e..cc1115db 100644 --- a/replay_buffer.py +++ b/replay_buffer.py @@ -16,7 +16,7 @@ class ReplayBuffer: def __init__(self, initial_checkpoint, initial_buffer, config): self.config = config - self.buffer = copy.deepcopy(initial_buffer) + self.buffer = copy.deepcopy(initial_buffer) # buffer是一个字典,key是game id,value是game_history self.num_played_games = initial_checkpoint["num_played_games"] self.num_played_steps = initial_checkpoint["num_played_steps"] self.total_samples = sum( @@ -79,11 +79,14 @@ def get_batch(self): ) = ([], [], [], [], [], [], []) weight_batch = [] if self.config.PER else None + # 从buffer里抽取n鸽样本,有probs的话安装probs的概率抽取,没有的话按照uniform抽取 for game_id, game_history, game_prob in self.sample_n_games( self.config.batch_size ): + # 每个game_history都是一个游戏运行的序列,使用sample_position从这些序列里随机抽取一个位置 game_pos, pos_prob = self.sample_position(game_history) + # 计算从该位置开始的值,rewards等数据 values, rewards, policies, actions = self.make_target( game_history, game_pos ) @@ -165,11 +168,11 @@ def sample_n_games(self, n_games, force_uniform=False): game_id_list.append(game_id) game_probs.append(game_history.game_priority) game_probs = numpy.array(game_probs, dtype="float32") - game_probs /= 
numpy.sum(game_probs) + game_probs /= numpy.sum(game_probs) # 每一个都除以game_probs的总和,可以看成是归一化 game_prob_dict = dict( [(game_id, prob) for game_id, prob in zip(game_id_list, game_probs)] ) - selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) + selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) # 抽取n个样本, 抽取的概率是根据game_probs确定的 else: selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) game_prob_dict = {} @@ -177,10 +180,11 @@ def sample_n_games(self, n_games, force_uniform=False): (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) for game_id in selected_games ] - return ret + return ret # ret格式为[game_id, game_history, game_prob] def sample_position(self, game_history, force_uniform=False): """ + 统一或根据某些优先级从游戏中采样位置。 Sample position from game either uniformly or according to some priority. See paper appendix Training. """ @@ -230,6 +234,8 @@ def update_priorities(self, priorities, index_info): def compute_target_value(self, game_history, index): # The value target is the discounted root value of the search tree td_steps into the # future, plus the discounted sum of all rewards until then. + # 价值目标是未来搜索树 td_steps 的折扣根值,加上到那时为止的所有奖励的折扣总和。 + # 计算公式 ∑r*γ^n bootstrap_index = index + self.config.td_steps if bootstrap_index < len(game_history.root_values): root_values = ( @@ -237,6 +243,8 @@ def compute_target_value(self, game_history, index): if game_history.reanalysed_predicted_root_values is None else game_history.reanalysed_predicted_root_values ) + + # 检查当前的id和目标id是否一致,如果不一致则取负 last_step_value = ( root_values[bootstrap_index] if game_history.to_play_history[bootstrap_index] @@ -244,13 +252,15 @@ def compute_target_value(self, game_history, index): else -root_values[bootstrap_index] ) + # 计算公式 r*γ^n value = last_step_value * self.config.discount**self.config.td_steps - else: + else: # 因为终点的长度超过了数据,因此设为0 value = 0 for i, reward in enumerate( - game_history.reward_history[index + 1 : bootstrap_index + 1] + game_history.reward_history[index + 1 : bootstrap_index + 1] # 获取reward,从index+1到最大(如果长度不够则只会取到最后) ): + # 根据对手决定正负号,只会累计到value上 # The value is oriented from the perspective of the current player value += ( reward @@ -259,12 +269,13 @@ def compute_target_value(self, game_history, index): else -reward ) * self.config.discount**i - return value + return value # 返回value def make_target(self, game_history, state_index): """ Generate targets for every unroll steps. """ + # target policies 是 策略选择的概率序列,如[[0.4,0.6], [0.5,0.5],...] 
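[Editor's aside] The comments above describe the value target as the discounted root value td_steps into the future plus the discounted sum of intermediate rewards, with a sign flip whenever the bootstrap value or a reward belongs to the other player. A standalone sketch of that computation (function and argument names are illustrative; the repository's version is compute_target_value above, which reads the same quantities from a GameHistory):

def n_step_value(root_values, rewards, to_play, index, td_steps, discount):
    # Bootstrap from the search value td_steps ahead, if the game lasted that long.
    bootstrap = index + td_steps
    if bootstrap < len(root_values):
        sign = 1 if to_play[bootstrap] == to_play[index] else -1
        value = sign * root_values[bootstrap] * discount ** td_steps
    else:
        value = 0  # past the end of the game: nothing left to bootstrap from

    # Add the discounted rewards collected between index and the bootstrap step,
    # oriented from the perspective of the player to move at `index`.
    for i, reward in enumerate(rewards[index + 1 : bootstrap + 1]):
        sign = 1 if to_play[index + i] == to_play[index] else -1
        value += sign * reward * discount ** i
    return value

For a single-player game (all to_play entries equal) this reduces to value = sum_i discount**i * r[index+1+i] + discount**td_steps * v[bootstrap].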
target_values, target_rewards, target_policies, actions = [], [], [], [] for current_index in range( state_index, state_index + self.config.num_unroll_steps + 1 @@ -280,6 +291,7 @@ def make_target(self, game_history, state_index): target_values.append(0) target_rewards.append(game_history.reward_history[current_index]) # Uniform policy + # 因为是游戏结束的状态,因此选择各个策略的概率是平均分布的 target_policies.append( [ 1 / len(game_history.child_visits[0]) @@ -287,8 +299,9 @@ def make_target(self, game_history, state_index): ] ) actions.append(game_history.action_history[current_index]) - else: + else: # 如果current index 大于 game_history的长度 # States past the end of games are treated as absorbing states + # 游戏结束后的状态被视为吸收状态,因此都为0 target_values.append(0) target_rewards.append(0) # Uniform policy diff --git a/self_play.py b/self_play.py index d09c5e87..c62802f7 100644 --- a/self_play.py +++ b/self_play.py @@ -128,7 +128,7 @@ def play_game( game_history.action_history.append(0) game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 done = False diff --git a/simplifiedMuZero/__init__.py b/simplifiedMuZero/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/models_2net.py b/simplifiedMuZero/net2/models_2net.py similarity index 98% rename from simplifiedMuZero/models_2net.py rename to simplifiedMuZero/net2/models_2net.py index 0a5428df..b62de9db 100644 --- a/simplifiedMuZero/models_2net.py +++ b/simplifiedMuZero/net2/models_2net.py @@ -100,20 +100,21 @@ def __init__( stacked_observations + 1) \ + stacked_observations * observation_shape[1] * observation_shape[2] + # 输出等于输入,即编码维度等于输入维度 encoding_size = representation_input_size - self.representation_network = torch.nn.DataParallel( - # mlp( - # representation_input_size, - # fc_representation_layers, - # encoding_size, - # ) - mlp( - representation_input_size + self.action_space_size, - fc_representation_layers, - encoding_size, - ) - ) + # self.representation_network = torch.nn.DataParallel( + # # mlp( + # # representation_input_size, + # # fc_representation_layers, + # # encoding_size, + # # ) + # mlp( + # representation_input_size + self.action_space_size, + # fc_representation_layers, + # encoding_size, + # ) + # ) #dynamics的输入是encoding_size+action_space_size self.dynamics_encoded_state_network = torch.nn.DataParallel( diff --git a/simplifiedMuZero/replay_buffer3.py b/simplifiedMuZero/net2/replay_buffer_2net.py similarity index 98% rename from simplifiedMuZero/replay_buffer3.py rename to simplifiedMuZero/net2/replay_buffer_2net.py index 762d5a0e..55522b86 100644 --- a/simplifiedMuZero/replay_buffer3.py +++ b/simplifiedMuZero/net2/replay_buffer_2net.py @@ -5,7 +5,7 @@ import ray import torch -import simplifiedMuZero.models_2net as models +import simplifiedMuZero.net2.models_2net as models @ray.remote @@ -31,7 +31,7 @@ def __init__(self, initial_checkpoint, initial_buffer, config): numpy.random.seed(self.config.seed) def save_game(self, game_history, shared_storage=None): - if self.config.PER: + if self.config.PER: # config.PER指的是优先重放 Prioritized Replay(参见论文附录训练),优先选择重放缓冲区中网络意外的元素 if game_history.priorities is not None: # Avoid read only array when loading replay buffer from disk game_history.priorities = numpy.copy(game_history.priorities) diff --git a/simplifiedMuZero/self_play_2net.py b/simplifiedMuZero/net2/self_play_2net.py 
similarity index 99% rename from simplifiedMuZero/self_play_2net.py rename to simplifiedMuZero/net2/self_play_2net.py index af2a2e39..a0a208a8 100644 --- a/simplifiedMuZero/self_play_2net.py +++ b/simplifiedMuZero/net2/self_play_2net.py @@ -5,7 +5,7 @@ import ray import torch -import simplifiedMuZero.models_2net as models +import simplifiedMuZero.net2.models_2net as models @ray.remote diff --git a/simplifiedMuZero/trainer_2net.py b/simplifiedMuZero/net2/trainer_2net.py similarity index 98% rename from simplifiedMuZero/trainer_2net.py rename to simplifiedMuZero/net2/trainer_2net.py index 244fb7ee..19888cf2 100644 --- a/simplifiedMuZero/trainer_2net.py +++ b/simplifiedMuZero/net2/trainer_2net.py @@ -5,7 +5,7 @@ import ray import torch -import simplifiedMuZero.models_2net as models +import simplifiedMuZero.net2.models_2net as models @ray.remote @@ -69,6 +69,8 @@ def continuous_update_weights(self, replay_buffer, shared_storage): shared_storage.get_info.remote("terminate") ): index_batch, batch = ray.get(next_batch) + print("train batch size is : ", batch[0].shape) + print("train index_batch size is : ", index_batch.shape) next_batch = replay_buffer.get_batch.remote() self.update_lr() ( diff --git a/simplifiedMuZero/RHEA.py b/simplifiedMuZero/search_policy/RHEA.py similarity index 100% rename from simplifiedMuZero/RHEA.py rename to simplifiedMuZero/search_policy/RHEA.py diff --git a/simplifiedMuZero/search_policy/__init__.py b/simplifiedMuZero/search_policy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/models_without_replay_buffer.py b/simplifiedMuZero/without_rb/models_without_replay_buffer.py similarity index 100% rename from simplifiedMuZero/models_without_replay_buffer.py rename to simplifiedMuZero/without_rb/models_without_replay_buffer.py diff --git a/simplifiedMuZero/self_play_without_replay_buffer.py b/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py similarity index 99% rename from simplifiedMuZero/self_play_without_replay_buffer.py rename to simplifiedMuZero/without_rb/self_play_without_replay_buffer.py index 89174d92..7e0d6512 100644 --- a/simplifiedMuZero/self_play_without_replay_buffer.py +++ b/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py @@ -5,7 +5,7 @@ # import ray import torch -import models +import simplifiedMuZero.without_rb.models_without_replay_buffer as models # @ray.remote diff --git a/simplifiedMuZero/trainer_without_replay_buffer.py b/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py similarity index 97% rename from simplifiedMuZero/trainer_without_replay_buffer.py rename to simplifiedMuZero/without_rb/trainer_without_replay_buffer.py index 48236e0f..e2f64fa2 100644 --- a/simplifiedMuZero/trainer_without_replay_buffer.py +++ b/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py @@ -2,10 +2,10 @@ import time import numpy -import ray +# import ray import torch -import models +import simplifiedMuZero.without_rb.models_without_replay_buffer as models @ray.remote @@ -69,7 +69,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): next_batch = replay_buffer.get_batch.remote() # Training loop while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 ): index_batch, batch = ray.get(next_batch) next_batch = replay_buffer.get_batch.remote() diff --git a/test/game_play_test.py b/test/game_play_test.py new file mode 
100644 index 00000000..60b6a5ec --- /dev/null +++ b/test/game_play_test.py @@ -0,0 +1,696 @@ +from self_play import MCTS, GameHistory +from games.simple_grid import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import numpy +import torch + +import math +import time +import copy + +class MySelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. + """ + + def __init__(self, model, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + # self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(initial_checkpoint["weights"]) + self.model = model + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + game_id = None + + if render: + self.game.render() + + game_id = self.game.to_play() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # The index is -1: game_history already holds the reset observation when it is created, so its length is 1 and the index becomes 1 after the modulo (%) + # config.stacked_observations is the number of past observations to keep; set it to 0 if no history is needed + + # The if-else below selects an action + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play returns the id of the player to move, 0 by default + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # select the action according to the temperature + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( # select the opponent action: random, human or expert + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # advance the game + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # append to the observation queue; get_stacked_observations reads it back from the end + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_id, game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # Select an action from the visit-count distribution and the temperature. The temperature changes dynamically through the visit_softmax_temperature function in the config. + # The formula is c^(1/t): + # the smaller t, the closer 1/t gets to infinity, so the action with the largest count is almost always chosen; + # the larger t, the closer 1/t gets to 0, so c^0 = 1 for every action, the counts become indistinguishable and the choice is essentially random. + # As special cases, t=0 picks the most visited action with argmax, and t=+∞ picks uniformly at random. + @staticmethod # static method decorator, similar to the static keyword + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config.
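As a quick numeric illustration of the c^(1/t) behaviour described above (a standalone sketch, not part of this patch; the visit counts are made up), the temperature reshapes the sampling distribution like this:

import numpy

visit_counts = numpy.array([2, 8, 40], dtype="float64")
for t in (0.25, 1.0, 4.0):
    dist = visit_counts ** (1 / t)
    dist = dist / dist.sum()
    print(t, dist.round(3))
# t=0.25 -> approximately [0.000 0.002 0.998]  (nearly greedy on the most visited action)
# t=1.00 -> [0.04 0.16 0.80]                   (proportional to the visit counts)
# t=4.00 -> approximately [0.221 0.312 0.467]  (close to uniform)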
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. 
+ """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) 
+ + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss + +if __name__ == "__main__": + config = MuZeroConfig() + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(checkpoint, config) + selfplay = MySelfPlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + for i in range(config.training_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + # buffer[game_id] = game_history + + play_buffer.update_game_history(game_id, game_history) + + for i in range(10): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + trainer.update_weights(batch) + + selfplay.close_game() + + diff --git a/trainer.py b/trainer.py index faa5f941..3e035c51 100644 --- a/trainer.py +++ b/trainer.py @@ -66,7 +66,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): next_batch = replay_buffer.get_batch.remote() # Training loop while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 ): index_batch, batch 
= ray.get(next_batch) next_batch = replay_buffer.get_batch.remote() @@ -117,7 +117,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): ) > self.config.ratio and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 ): time.sleep(0.5) From 98f8b05dffcaba7954b67b0371342b142ca40f58 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 16 Aug 2023 22:09:30 +0100 Subject: [PATCH 4/9] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MuZero_No_Replay_Buffer.py | 1260 -------------- game_tournament.py | 221 +++ games/tictactoe.py | 3 +- muzero_2net.py | 20 +- muzero_uniform.py | 719 ++++++++ muzero_without_replay_buffer.py | 1443 +++++++++-------- muzero_without_replay_buffer2.py | 417 +++++ muzero_without_replay_buffer_tictactoe.py | 242 +++ simplifiedMuZero/net2/trainer_2net.py | 2 - .../self_play_uniform_search.py} | 94 +- simplifiedMuZero/without_rb/game_play.py | 182 +++ .../models_without_replay_buffer.py | 696 -------- simplifiedMuZero/without_rb/play_buffer.py | 214 +++ ...er_without_replay_buffer.py => trainer.py} | 120 +- test/game_play_test.py | 10 +- test/mcts_test.py | 245 +++ test/muzero_config_test.py | 6 + trainer.py | 2 +- 18 files changed, 3140 insertions(+), 2756 deletions(-) delete mode 100644 MuZero_No_Replay_Buffer.py create mode 100644 game_tournament.py create mode 100644 muzero_uniform.py create mode 100644 muzero_without_replay_buffer2.py create mode 100644 muzero_without_replay_buffer_tictactoe.py rename simplifiedMuZero/{without_rb/self_play_without_replay_buffer.py => search_policy/self_play_uniform_search.py} (91%) create mode 100644 simplifiedMuZero/without_rb/game_play.py delete mode 100644 simplifiedMuZero/without_rb/models_without_replay_buffer.py create mode 100644 simplifiedMuZero/without_rb/play_buffer.py rename simplifiedMuZero/without_rb/{trainer_without_replay_buffer.py => trainer.py} (67%) create mode 100644 test/mcts_test.py create mode 100644 test/muzero_config_test.py diff --git a/MuZero_No_Replay_Buffer.py b/MuZero_No_Replay_Buffer.py deleted file mode 100644 index bf280c71..00000000 --- a/MuZero_No_Replay_Buffer.py +++ /dev/null @@ -1,1260 +0,0 @@ -import copy -import importlib -import json -import math -import pathlib -import pickle -import sys -import time - -import nevergrad -import numpy -import ray -import torch -from torch.utils.tensorboard import SummaryWriter - -import diagnose_model -# import simplifiedMuZero.without_rb.models_without_replay_buffer as models -import models -# import replay_buffer -# import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play -import shared_storage -# import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer -from self_play import MCTS, GameHistory -from muzero import load_model_menu, CPUActor - -# training_step是一个全局变量,用来存储现有的运行次数,不要超过游戏config里的training_steps,如30000次 - -class GamePlay: - """ - Class which run in a dedicated thread to play games and save them to the replay-buffer. 
- """ - - def __init__(self, initial_checkpoint, Game, config, seed): - self.config = config - self.game = Game(seed) - - # Fix random generator seed - numpy.random.seed(seed) - torch.manual_seed(seed) - - # Initialize the network - self.model = models.MuZeroNetwork(self.config) - self.model.set_weights(initial_checkpoint["weights"]) - self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) - self.model.eval() - self.trained_steps = initial_checkpoint["training_step"] - self.terminate = False - - def continuous_self_play(self, test_mode=False): - # def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): - while self.trained_steps < self.config.training_steps and not self.terminate: # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 - # 此处不要用set——weights,因为现在移除了replay_buffer,不需要shared_storage了 - self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 - - if not test_mode: - # game_history = self.play_game( - # self.config.visit_softmax_temperature_fn( - # trained_steps=ray.get( - # shared_storage.get_info.remote("training_step") - # ) - # ), - # self.config.temperature_threshold, - # False, - # "self", - # 0, - # ) - game_history = self.play_game( - self.config.visit_softmax_temperature_fn( - self.trained_steps - ), - self.config.temperature_threshold, - False, - "self", - 0, - ) - - # replay_buffer.save_game.remote(game_history, shared_storage) - return game_history - - else: - # Take the best action (no exploration) in test mode # 在测试模式下采取最佳行动(无探索) - game_history = self.play_game( - 0, - self.config.temperature_threshold, - False, - "self" if len(self.config.players) == 1 else self.config.opponent, - self.config.muzero_player, - ) - - # Save to the shared storage - shared_storage.set_info.remote( - { - "episode_length": len(game_history.action_history) - 1, - "total_reward": sum(game_history.reward_history), - "mean_value": numpy.mean( - [value for value in game_history.root_values if value] - ), - } - ) - if 1 < len(self.config.players): - shared_storage.set_info.remote( - { - "muzero_reward": sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - == self.config.muzero_player - ), - "opponent_reward": sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - != self.config.muzero_player - ), - } - ) - - # Managing the self-play / training ratio - if not test_mode and self.config.self_play_delay: - time.sleep(self.config.self_play_delay) - if not test_mode and self.config.ratio: - while ( - ray.get(shared_storage.get_info.remote("training_step")) - / max( - 1, ray.get(shared_storage.get_info.remote("num_played_steps")) - ) - < self.config.ratio - and ray.get(shared_storage.get_info.remote("training_step")) - < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) - ): - time.sleep(0.5) - - self.close_game() - - # play game 与continuous self play 的区别: - # 1. play game 是实际运行游戏,游戏的结果存在game history里,不向replay buffer里写 - # 2. continuous self play 调用play game,把获取到的game history 异步写进 replay buffer - #play game 运行 - # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 - # 运行步骤: - # 1. 创建GameHistory用来存储数据 - # 2. 检查游戏是否结束或者到底最大移动次数 - # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) - # 4. 运行MCTS搜索下一步的action - # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done - # 6. 持续运行2-5步直到结束 - # 7. 
返回GameHistory - def play_game( - self, temperature, temperature_threshold, render, opponent, muzero_player - ): - """ - Play one game with actions based on the Monte Carlo tree search at each moves. - """ - game_history = GameHistory() - observation = self.game.reset() - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - if render: - self.game.render() - - with torch.no_grad(): - while ( - not done and len(game_history.action_history) <= self.config.max_moves - ): # 游戏没有结束且运行步数小于最大移动步长 - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 - # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 - - # 一下的if-else部分主要是为了选择一个动作 - # Choose the action - if opponent == "self" or muzero_player == self.game.to_play(): - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = self.select_action( - root, - temperature - if not temperature_threshold - or len(game_history.action_history) < temperature_threshold - else 0, - ) # 根据temperature选择动作 - - if render: - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print( - f"Root value for player {self.game.to_play()}: {root.value():.2f}" - ) - else: - action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 - opponent, stacked_observations - ) - - observation, reward, done = self.game.step(action) # 运行游戏 - - if render: - print(f"Played action: {self.game.action_to_string(action)}") - self.game.render() - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - return game_history - - def close_game(self): - self.game.close() - - def select_opponent_action(self, opponent, stacked_observations): - """ - Select opponent action for evaluating MuZero level. - """ - if opponent == "human": - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), - True, - ) - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") - print( - f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" - ) - return self.game.human_to_action(), root - elif opponent == "expert": - return self.game.expert_agent(), None - elif opponent == "random": - assert ( - self.game.legal_actions() - ), f"Legal actions should not be an empty array. 
Got {self.game.legal_actions()}." - assert set(self.game.legal_actions()).issubset( - set(self.config.action_space) - ), "Legal actions should be a subset of the action space." - - return numpy.random.choice(self.game.legal_actions()), None - else: - raise NotImplementedError( - 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' - ) - - # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 - # 公式为 c^(1/t)。可以看到: - # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 - # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 - # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 - @staticmethod # 静态方法修饰符,类似于static关键字 - def select_action(node, temperature): - """ - Select action according to the visit count distribution and the temperature. - The temperature is changed dynamically with the visit_softmax_temperature function - in the config. - """ - visit_counts = numpy.array( - [child.visit_count for child in node.children.values()], dtype="int32" - ) - actions = [action for action in node.children.keys()] - if temperature == 0: - action = actions[numpy.argmax(visit_counts)] - elif temperature == float("inf"): - action = numpy.random.choice(actions) - else: - # See paper appendix Data Generation - visit_count_distribution = visit_counts ** (1 / temperature) - visit_count_distribution = visit_count_distribution / sum( - visit_count_distribution - ) - action = numpy.random.choice(actions, p=visit_count_distribution) - - return action - -class Trainer_without_Replay_Buffer: - """ - Class which run in a dedicated thread to train a neural network and save it - in the shared storage. - """ - - def __init__(self, initial_checkpoint, config): - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Initialize the network - self.model = models.MuZeroNetwork(self.config) - self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) - self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) - self.model.train() - - self.training_step = initial_checkpoint["training_step"] - - if "cuda" not in str(next(self.model.parameters()).device): - print("You are not training on GPU.\n") - - # Initialize the optimizer - if self.config.optimizer == "SGD": - self.optimizer = torch.optim.SGD( - self.model.parameters(), - lr=self.config.lr_init, - momentum=self.config.momentum, - weight_decay=self.config.weight_decay, - ) - elif self.config.optimizer == "Adam": - self.optimizer = torch.optim.Adam( - self.model.parameters(), - lr=self.config.lr_init, - weight_decay=self.config.weight_decay, - ) - else: - raise NotImplementedError( - f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." - ) - - if initial_checkpoint["optimizer_state"] is not None: - print("Loading optimizer...\n") - self.optimizer.load_state_dict( - copy.deepcopy(initial_checkpoint["optimizer_state"]) - ) - - # update weights 与 continuous update weights 的区别 - # 1. update weights 是实际计算更新network的权重 - # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 - def continuous_update_weights(self, replay_buffer, shared_storage): - # Wait for the replay buffer to be filled - while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: - time.sleep(0.1) - - next_batch = replay_buffer.get_batch.remote() - # Training loop - while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") - ): - index_batch, batch = ray.get(next_batch) - next_batch = replay_buffer.get_batch.remote() - self.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = self.update_weights(batch) - - if self.config.PER: - # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) - replay_buffer.update_priorities.remote(priorities, index_batch) - - # Save to the shared storage - if self.training_step % self.config.checkpoint_interval == 0: - shared_storage.set_info.remote( - { - "weights": copy.deepcopy(self.model.get_weights()), - "optimizer_state": copy.deepcopy( - models.dict_to_cpu(self.optimizer.state_dict()) - ), - } - ) - if self.config.save_model: - shared_storage.save_checkpoint.remote() - shared_storage.set_info.remote( - { - "training_step": self.training_step, - "lr": self.optimizer.param_groups[0]["lr"], - "total_loss": total_loss, - "value_loss": value_loss, - "reward_loss": reward_loss, - "policy_loss": policy_loss, - } - ) - - # Managing the self-play / training ratio - if self.config.training_delay: - time.sleep(self.config.training_delay) - if self.config.ratio: - while ( - self.training_step - / max( - 1, ray.get(shared_storage.get_info.remote("num_played_steps")) - ) - > self.config.ratio - and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) - ): - time.sleep(0.5) - - def update_weights(self, batch): - """ - Perform one training step. 
- """ - - ( - observation_batch, - action_batch, - target_value, - target_reward, - target_policy, - weight_batch, - gradient_scale_batch, - ) = batch - - # Keep values as scalars for calculating the priorities for the prioritized replay - target_value_scalar = numpy.array(target_value, dtype="float32") - priorities = numpy.zeros_like(target_value_scalar) - - device = next(self.model.parameters()).device - if self.config.PER: - weight_batch = torch.tensor(weight_batch.copy()).float().to(device) - observation_batch = ( - torch.tensor(numpy.array(observation_batch)).float().to(device) - ) - action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) - target_value = torch.tensor(target_value).float().to(device) - target_reward = torch.tensor(target_reward).float().to(device) - target_policy = torch.tensor(target_policy).float().to(device) - gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) - # observation_batch: batch, channels, height, width - # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) - # target_value: batch, num_unroll_steps+1 - # target_reward: batch, num_unroll_steps+1 - # target_policy: batch, num_unroll_steps+1, len(action_space) - # gradient_scale_batch: batch, num_unroll_steps+1 - - target_value = models.scalar_to_support(target_value, self.config.support_size) - target_reward = models.scalar_to_support( - target_reward, self.config.support_size - ) - # target_value: batch, num_unroll_steps+1, 2*support_size+1 - # target_reward: batch, num_unroll_steps+1, 2*support_size+1 - - ## Generate predictions - value, reward, policy_logits, hidden_state = self.model.initial_inference( - observation_batch - ) - predictions = [(value, reward, policy_logits)] - for i in range(1, action_batch.shape[1]): - value, reward, policy_logits, hidden_state = self.model.recurrent_inference( - hidden_state, action_batch[:, i] - ) - # Scale the gradient at the start of the dynamics function (See paper appendix Training) - hidden_state.register_hook(lambda grad: grad * 0.5) - predictions.append((value, reward, policy_logits)) - # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) - - ## Compute losses - value_loss, reward_loss, policy_loss = (0, 0, 0) - value, reward, policy_logits = predictions[0] - # Ignore reward loss for the first batch step - current_value_loss, _, current_policy_loss = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, 0], - target_reward[:, 0], - target_policy[:, 0], - ) - value_loss += current_value_loss - policy_loss += current_policy_loss - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, 0] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) - ** self.config.PER_alpha - ) - - for i in range(1, len(predictions)): - value, reward, policy_logits = predictions[i] - ( - current_value_loss, - current_reward_loss, - current_policy_loss, - ) = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, i], - target_reward[:, i], - target_policy[:, i], - ) - - # Scale gradient by the number of unroll steps (See paper appendix Training) - current_value_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - current_reward_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - 
) - current_policy_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - - value_loss += current_value_loss - reward_loss += current_reward_loss - policy_loss += current_policy_loss - - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, i] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, i]) - ** self.config.PER_alpha - ) - - # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) - loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - if self.config.PER: - # Correct PER bias by using importance-sampling (IS) weights - loss *= weight_batch - # Mean over batch dimension (pseudocode do a sum) - loss = loss.mean() - - # Optimize - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - # 此处才算一次迭代完成,training step加1 - self.training_step += 1 - - return ( - priorities, - # For log purpose - loss.item(), - value_loss.mean().item(), - reward_loss.mean().item(), - policy_loss.mean().item(), - ) - - def update_lr(self): - """ - Update learning rate - """ - lr = self.config.lr_init * self.config.lr_decay_rate ** ( - self.training_step / self.config.lr_decay_steps - ) - for param_group in self.optimizer.param_groups: - param_group["lr"] = lr - - @staticmethod - def loss_function( - value, - reward, - policy_logits, - target_value, - target_reward, - target_policy, - ): - # Cross-entropy seems to have a better convergence than MSE - value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) - reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) - policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( - 1 - ) - return value_loss, reward_loss, policy_loss - -class MuZero_No_Replay_Buffer: - """ - Main class to manage MuZero. - - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. - - config (dict, MuZeroConfig, optional): Override the default config of the game. - - split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. - - Example: - >>> muzero = MuZero_No_Replay_Buffer("cartpole") - >>> muzero.train() - >>> muzero.test(render=True) - """ - - def __init__(self, game_name, config=None, split_resources_in=1): - # Load the game and the config from the module with the game name - try: - game_module = importlib.import_module("games." + game_name) - print("games." + game_name) - self.Game = game_module.Game - self.config = game_module.MuZeroConfig() - except ModuleNotFoundError as err: - print( - f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' - ) - raise err - - # Overwrite the config - if config: - if type(config) is dict: - for param, value in config.items(): - if hasattr(self.config, param): - setattr(self.config, param, value) - else: - raise AttributeError( - f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
- ) - else: - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Manage GPUs - if self.config.max_num_gpus == 0 and ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - raise ValueError( - "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." - ) - if ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - total_gpus = ( - self.config.max_num_gpus - if self.config.max_num_gpus is not None - else torch.cuda.device_count() - ) - else: - total_gpus = 0 - self.num_gpus = total_gpus / split_resources_in - if 1 < self.num_gpus: - self.num_gpus = math.floor(self.num_gpus) - - ray.init(num_gpus=total_gpus, ignore_reinit_error=True) - - # Checkpoint and replay buffer used to initialize workers - self.checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - self.replay_buffer = {} - - # cpu_actor = CPUActor.remote() - # cpu_weights = cpu_actor.get_initial_weights.remote(self.config) - # 移除ray - cpu_actor = CPUActor() - cpu_weights = cpu_actor.get_initial_weights(self.config) - self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) - - # Workers - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None - - def train(self, log_in_tensorboard=True): - """ - Spawn ray workers and launch the training. - - Args: - log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
- """ - if log_in_tensorboard or self.config.save_model: - self.config.results_path.mkdir(parents=True, exist_ok=True) - - # Manage GPUs - if 0 < self.num_gpus: - num_gpus_per_worker = self.num_gpus / ( - self.config.train_on_gpu - + self.config.num_workers * self.config.selfplay_on_gpu - + log_in_tensorboard * self.config.selfplay_on_gpu - + self.config.use_last_model_value * self.config.reanalyse_on_gpu - ) - if 1 < num_gpus_per_worker: - num_gpus_per_worker = math.floor(num_gpus_per_worker) - else: - num_gpus_per_worker = 0 - - # Initialize workers - # self.training_worker = trainer.Trainer.options( - # num_cpus=0, - # num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, - # ).remote(self.checkpoint, self.config) - # - # self.shared_storage_worker = shared_storage.SharedStorage.remote( - # self.checkpoint, - # self.config, - # ) - # self.shared_storage_worker.set_info.remote("terminate", False) - # - # self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( - # self.checkpoint, self.replay_buffer, self.config - # ) - - # 初始化权重 - self.training_worker = Trainer_without_Replay_Buffer(self.checkpoint, self.config) - - # #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - # if self.config.use_last_model_value: - # self.reanalyse_worker = replay_buffer.Reanalyse.options( - # num_cpus=0, - # num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, - # ).remote(self.checkpoint, self.config) - # - # self.self_play_workers = [ - # self_play.SelfPlay.options( - # num_cpus=0, - # num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, - # ).remote( - # self.checkpoint, - # self.Game, - # self.config, - # self.config.seed + seed, - # ) - # for seed in range(self.config.num_workers) - # ] - # - # # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, - # - # # Launch workers - # # 此处调用worker进行self play,把结果存在replay_buffer里 - # [ - # self_play_worker.continuous_self_play.remote( - # self.shared_storage_worker, self.replay_buffer_worker - # ) - # for self_play_worker in self.self_play_workers - # ] - - # # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 - # self.training_worker.continuous_update_weights.remote( - # self.replay_buffer_worker, self.shared_storage_worker - # ) - self.training_worker.continuous_update_weights(self.replay_buffer_worker, self.shared_storage_worker) - - # # 使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - # if self.config.use_last_model_value: - # self.reanalyse_worker.reanalyse.remote( - # self.replay_buffer_worker, self.shared_storage_worker - # ) - - if log_in_tensorboard: - self.logging_loop( - num_gpus_per_worker if self.config.selfplay_on_gpu else 0, - ) - - def logging_loop(self, num_gpus): - """ - Keep track of the training performance. 
- """ - # Launch the test worker to get performance metrics - self.test_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote( - self.checkpoint, - self.Game, - self.config, - self.config.seed + self.config.num_workers, - ) - self.test_worker.continuous_self_play.remote( - self.shared_storage_worker, None, True - ) - - # Write everything in TensorBoard - writer = SummaryWriter(self.config.results_path) - - print( - "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" - ) - - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in self.config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # Save model representation - writer.add_text( - "Model summary", - self.summary, - ) - # Loop for updating the training performance - counter = 0 - keys = [ - "total_reward", - "muzero_reward", - "opponent_reward", - "episode_length", - "mean_value", - "training_step", - "lr", - "total_loss", - "value_loss", - "reward_loss", - "policy_loss", - "num_played_games", - "num_played_steps", - "num_reanalysed_games", - ] - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - try: - while info["training_step"] < self.config.training_steps: - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - writer.add_scalar( - "1.Total_reward/1.Total_reward", - info["total_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/2.Mean_value", - info["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - info["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - info["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - info["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - info["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", info["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", info["num_played_steps"], counter - ) - writer.add_scalar( - "2.Workers/4.Reanalysed_games", - info["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - info["training_step"] / max(1, info["num_played_steps"]), - counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", info["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) - print( - f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - time.sleep(0.5) - except KeyboardInterrupt: - pass - - self.terminate_workers() - - if self.config.save_model: - # Persist replay buffer to disk - path = self.config.results_path / "replay_buffer.pkl" - print(f"\n\nPersisting replay buffer games to disk at {path}") - # 此处是将replay buffer的结果写入文件保持 - pickle.dump( - { - "buffer": self.replay_buffer, - "num_played_games": self.checkpoint["num_played_games"], - "num_played_steps": self.checkpoint["num_played_steps"], - "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], - }, - open(path, "wb"), - ) - - def terminate_workers(self): - """ - Softly terminate the running tasks and garbage collect the workers. - """ - if self.shared_storage_worker: - self.shared_storage_worker.set_info.remote("terminate", True) - self.checkpoint = ray.get( - self.shared_storage_worker.get_checkpoint.remote() - ) - if self.replay_buffer_worker: - self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) - - print("\nShutting down workers...") - - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None - - def test( - self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 - ): - """ - Test the model in a dedicated thread. - - Args: - render (bool): To display or not the environment. Defaults to True. - - opponent (str): "self" for self-play, "human" for playing against MuZero and "random" - for a random agent, None will use the opponent in the config. Defaults to None. - - muzero_player (int): Player number of MuZero in case of multiplayer - games, None let MuZero play all players turn by turn, None will use muzero_player in - the config. Defaults to None. - - num_tests (int): Number of games to average. Defaults to 1. - - num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. - """ - opponent = opponent if opponent else self.config.opponent - muzero_player = muzero_player if muzero_player else self.config.muzero_player - self_play_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) - results = [] - for i in range(num_tests): - print(f"Testing {i+1}/{num_tests}") - results.append( - ray.get( - self_play_worker.play_game.remote( - 0, - 0, - render, - opponent, - muzero_player, - ) - ) - ) - self_play_worker.close_game.remote() - - if len(self.config.players) == 1: - result = numpy.mean([sum(history.reward_history) for history in results]) - else: - result = numpy.mean( - [ - sum( - reward - for i, reward in enumerate(history.reward_history) - if history.to_play_history[i - 1] == muzero_player - ) - for history in results - ] - ) - return result - - def load_model(self, checkpoint_path=None, replay_buffer_path=None): - """ - Load a model and/or a saved replay buffer. - - Args: - checkpoint_path (str): Path to model.checkpoint or model.weights. 
- - replay_buffer_path (str): Path to replay_buffer.pkl - """ - # Load checkpoint - if checkpoint_path: - checkpoint_path = pathlib.Path(checkpoint_path) - self.checkpoint = torch.load(checkpoint_path) - print(f"\nUsing checkpoint from {checkpoint_path}") - - # Load replay buffer - if replay_buffer_path: - replay_buffer_path = pathlib.Path(replay_buffer_path) - # pickle用来存储和导入文件,其作用是将对象转换为字符串或者将字符串转换为对象 - with open(replay_buffer_path, "rb") as f: - replay_buffer_infos = pickle.load(f) - # 此处更新replay buffer的值 - self.replay_buffer = replay_buffer_infos["buffer"] - self.checkpoint["num_played_steps"] = replay_buffer_infos[ - "num_played_steps" - ] - self.checkpoint["num_played_games"] = replay_buffer_infos[ - "num_played_games" - ] - self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ - "num_reanalysed_games" - ] - - print(f"\nInitializing replay buffer with {replay_buffer_path}") - else: - print(f"Using empty buffer.") - self.replay_buffer = {} - self.checkpoint["training_step"] = 0 - self.checkpoint["num_played_steps"] = 0 - self.checkpoint["num_played_games"] = 0 - self.checkpoint["num_reanalysed_games"] = 0 - - def diagnose_model(self, horizon): - """ - Play a game only with the learned model then play the same trajectory in the real - environment and display information. - - Args: - horizon (int): Number of timesteps for which we collect information. - """ - game = self.Game(self.config.seed) - obs = game.reset() - dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) - dm.compare_virtual_with_real_trajectories(obs, game, horizon) - input("Press enter to close all plots") - dm.close_all() - - -# @ray.remote(num_cpus=0, num_gpus=0) -# class CPUActor: -# # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU -# def __init__(self): -# pass -# -# def get_initial_weights(self, config): -# model = models.MuZeroNetwork(config) -# weigths = model.get_weights() -# summary = str(model).replace("\n", " \n\n") -# return weigths, summary - - -def hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, num_tests -): - """ - Search for hyperparameters by launching parallel experiments. - - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. - - parametrization : Nevergrad parametrization, please refer to nevergrad documentation. - - budget (int): Number of experiments to launch in total. - - parallel_experiments (int): Number of experiments to launch in parallel. - - num_tests (int): Number of games to average for evaluating an experiment. 
- """ - optimizer = nevergrad.optimizers.OnePlusOne( - parametrization=parametrization, budget=budget - ) - - running_experiments = [] - best_training = None - try: - # Launch initial experiments - for i in range(parallel_experiments): - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments.append(muzero) - budget -= 1 - - while 0 < budget or any(running_experiments): - for i, experiment in enumerate(running_experiments): - if experiment and experiment.config.training_steps <= ray.get( - experiment.shared_storage_worker.get_info.remote("training_step") - ): - experiment.terminate_workers() - result = experiment.test(False, num_tests=num_tests) - if not best_training or best_training["result"] < result: - best_training = { - "result": result, - "config": experiment.config, - "checkpoint": experiment.checkpoint, - } - print(f"Parameters: {experiment.param.value}") - print(f"Result: {result}") - optimizer.tell(experiment.param, -result) - - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments[i] = muzero - budget -= 1 - else: - running_experiments[i] = None - - except KeyboardInterrupt: - for experiment in running_experiments: - if isinstance(experiment, MuZero_No_Replay_Buffer): - experiment.terminate_workers() - - recommendation = optimizer.provide_recommendation() - print("Best hyperparameters:") - print(recommendation.value) - if best_training: - # Save best training weights (but it's not the recommended weights) - best_training["config"].results_path.mkdir(parents=True, exist_ok=True) - torch.save( - best_training["checkpoint"], - best_training["config"].results_path / "model.checkpoint", - ) - # Save the recommended hyperparameters - text_file = open( - best_training["config"].results_path / "best_parameters.txt", - "w", - ) - text_file.write(str(recommendation.value)) - text_file.close() - return recommendation.value - - -if __name__ == "__main__": - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZero_No_Replay_Buffer(sys.argv[1]) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZero_No_Replay_Buffer(sys.argv[1], config) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZero_No_Replay_Buffer(game_name) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 - ) - muzero = MuZero_No_Replay_Buffer(game_name, best_hyperparameters) - else: - break - print("\nDone") - - ray.shutdown() diff --git a/game_tournament.py b/game_tournament.py new file mode 100644 index 00000000..918beac3 --- /dev/null +++ b/game_tournament.py @@ -0,0 +1,221 @@ +import pickle + +import torch +import copy +import numpy + +from games.tictactoe import MuZeroConfig, Game +import models +from self_play import MCTS, GameHistory,SelfPlay +from simplifiedMuZero.search_policy.self_play_uniform_search import UniformSearch + +class GameTournament: + def __init__(self, config:MuZeroConfig): + self.models = [] + self.game = Game(config.seed) + self.config = config + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 0 + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def play_competition(self, model1, search_policy1, model2, search_policy2): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model1.eval() + model2.eval() + + is_model1 = True + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. 
Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + model = model1 if is_model1 else model2 + search_policy = search_policy1 if is_model1 else search_policy2 + + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model1 = not is_model1 + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model1 == (reward > 0) + + def close_game(self): + self.game.close() + + def play_tournament(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = game_tournament.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + # 交换顺序,再来一遍 + for _ in range(rollnum): + have_winner, is_model1 = game_tournament.play_competition(model2, MCTS, model1, MCTS) + + if have_winner: + if is_model1: + model2_win_num += 1 + else: + model1_win_num += 1 + else: + no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + + +def load_model(model_cls, model_path): + checkpoint = torch.load(model_path) + model = model_cls(config) + model.set_weights(checkpoint["weights"]) + + return model + + +if __name__ == "__main__": + config = MuZeroConfig() + + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1) + + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path) + + 
uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path) + + without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path) + + game_tournament = GameTournament(config) + + models = [ + {"name":"muzero_2net", "model":muzero_2net_model}, + {"name":"uniform", "model":uniform_model}, + {"name":"muzero", "model":muzero_model}, + {"name": "without_rb", "model": without_rb_model}, + ] + + # rollnum = 1000 + # + # # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + # model1_win_num = 0 + # model2_win_num = 0 + # no_winner_num = 0 + # + # for i in range(rollnum): + # have_winner, is_model1 = game_tournament.play_competition(muzero_2net_model, MCTS, uniform_model, MCTS) + # + # if have_winner: + # if is_model1: + # model1_win_num += 1 + # else: + # model2_win_num += 1 + # else: + # no_winner_num += 1 + # + # # print(is_model1) + # + # print(model1_win_num) + # print(model2_win_num) + # print(no_winner_num) + + game_tournament.play_tournament(models, rollnum=100) + + game_tournament.close_game() + + + # print(checkpoint) diff --git a/games/tictactoe.py b/games/tictactoe.py index f331a9ae..c2529d5d 100644 --- a/games/tictactoe.py +++ b/games/tictactoe.py @@ -75,7 +75,8 @@ def __init__(self): ### Training self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs self.save_model = True # Save the checkpoint in results_path as model.checkpoint - self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + self.training_steps = 50000 self.batch_size = 64 # Number of parts of games to train on at each training step self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) diff --git a/muzero_2net.py b/muzero_2net.py index d03457ec..39438acd 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -23,7 +23,7 @@ import simplifiedMuZero.net2.trainer_2net as trainer -class MuZero: +class MuZero_2Net: """ Main class to manage MuZero. @@ -36,7 +36,7 @@ class MuZero: split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. 
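# Editor's sketch (not part of the patch): play_tournament in game_tournament.py above is a
# round robin -- each pair of models plays `rollnum` games and then `rollnum` more with the
# seats swapped, so neither network always moves first. Compact version of that bookkeeping;
# play_competition(a, b) is assumed to return (have_winner, first_player_won).
import itertools

def round_robin(entries, play_competition, rollnum=100):
    scores = {entry["name"]: 0 for entry in entries}
    for a, b in itertools.combinations(entries, 2):
        for first, second in ((a, b), (b, a)):       # swap seats for the second half
            for _ in range(rollnum):
                have_winner, first_won = play_competition(first["model"], second["model"])
                if have_winner:
                    scores[first["name"] if first_won else second["name"]] += 1
    return scores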
Example: - >>> muzero = MuZero("cartpole") + >>> muzero = MuZero_2Net("cartpole") >>> muzero.train() >>> muzero.test(render=True) """ @@ -67,6 +67,8 @@ def __init__(self, game_name, config=None, split_resources_in=1): else: self.config = config + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_2net" # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) @@ -525,7 +527,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments.append(muzero) @@ -551,7 +553,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments[i] = muzero @@ -561,7 +563,7 @@ def hyperparameter_search( except KeyboardInterrupt: for experiment in running_experiments: - if isinstance(experiment, MuZero): + if isinstance(experiment, MuZero_2Net): experiment.terminate_workers() recommendation = optimizer.provide_recommendation() @@ -625,12 +627,12 @@ def load_model_menu(muzero, game_name): if __name__ == "__main__": if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole - muzero = MuZero(sys.argv[1]) + muzero = MuZero_2Net(sys.argv[1]) muzero.train() elif len(sys.argv) == 3: # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' config = json.loads(sys.argv[2]) - muzero = MuZero(sys.argv[1], config) + muzero = MuZero_2Net(sys.argv[1], config) muzero.train() else: print("\nWelcome to MuZero! Here's a list of games:") @@ -650,7 +652,7 @@ def load_model_menu(muzero, game_name): # Initialize MuZero choice = int(choice) game_name = games[choice] - muzero = MuZero(game_name) + muzero = MuZero_2Net(game_name) while True: # Configure running options @@ -710,7 +712,7 @@ def load_model_menu(muzero, game_name): best_hyperparameters = hyperparameter_search( game_name, parametrization, budget, parallel_experiments, 20 ) - muzero = MuZero(game_name, best_hyperparameters) + muzero = MuZero_2Net(game_name, best_hyperparameters) else: break print("\nDone") diff --git a/muzero_uniform.py b/muzero_uniform.py new file mode 100644 index 00000000..24a9e09b --- /dev/null +++ b/muzero_uniform.py @@ -0,0 +1,719 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import simplifiedMuZero.search_policy.self_play_uniform_search as self_play +import shared_storage +import trainer + + +class MuZero_uniform: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. 
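# Editor's note (not part of the patch): the `self.config.results_path /= "muzero_2net"`
# line above, like the matching "muzero_uniform" line below, relies on pathlib's in-place
# `/` operator, so each variant writes its checkpoint and TensorBoard logs into its own
# subfolder of the timestamped run directory. Illustration with one of the run timestamps
# mentioned in this patch:
import pathlib

results_path = pathlib.Path("results") / "tictactoe" / "2023-08-15--11-08-42"
results_path /= "muzero_2net"
print(results_path)  # results/tictactoe/2023-08-15--11-08-42/muzero_2net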
+ + Example: + >>> muzero = MuZero_uniform("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_uniform" + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
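# Editor's sketch (not part of the patch): train() below divides the available GPUs between
# the trainer, the self-play workers, the TensorBoard test worker and the reanalyse worker,
# mirroring the denominator in the hunk that follows. Worked example with hypothetical
# settings:
num_gpus = 1
train_on_gpu, selfplay_on_gpu, reanalyse_on_gpu = True, True, False
num_workers, log_in_tensorboard, use_last_model_value = 2, True, True

consumers = (
    train_on_gpu
    + num_workers * selfplay_on_gpu
    + log_in_tensorboard * selfplay_on_gpu       # the test worker also plays games
    + use_last_model_value * reanalyse_on_gpu
)
num_gpus_per_worker = num_gpus / consumers        # 1 / (1 + 2 + 1 + 0) = 0.25 GPU each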
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
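# Editor's sketch (not part of the patch): the launch code above wires a producer/consumer
# loop -- SelfPlay actors push finished games into the ReplayBuffer actor while the Trainer
# actor pulls batches and publishes fresh weights through SharedStorage. Stripped-down,
# self-contained ray illustration of that pattern; the actor and function names here are
# hypothetical, not the project's classes.
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class Buffer:
    def __init__(self):
        self.games = []

    def save_game(self, game):
        self.games.append(game)

    def sample(self):
        return self.games[-1] if self.games else None

@ray.remote
def self_play(buffer, num_games):
    for step in range(num_games):
        ray.get(buffer.save_game.remote({"game": step}))

@ray.remote
def train(buffer, steps):
    return [ray.get(buffer.sample.remote()) for _ in range(steps)]

buffer = Buffer.remote()
ray.get(self_play.remote(buffer, 5))      # producer: fills the buffer
print(ray.get(train.remote(buffer, 3)))   # consumer: samples from it
ray.shutdown()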
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_uniform): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_uniform(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_uniform(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_uniform(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_uniform(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py index e0a63690..2eba36a0 100644 --- a/muzero_without_replay_buffer.py +++ b/muzero_without_replay_buffer.py @@ -1,723 +1,870 @@ -import copy -import importlib -import json -import math -import pathlib -import pickle -import sys -import time +from self_play import MCTS, GameHistory +from games.simple_grid import MuZeroConfig, Game +# from games.tictactoe import 
MuZeroConfig, Game +import models -import nevergrad import numpy -import ray import torch from torch.utils.tensorboard import SummaryWriter +import pickle -import diagnose_model -import simplifiedMuZero.without_rb.models_without_replay_buffer as models -# import replay_buffer -import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play -import shared_storage -import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer - +import math +import time +import copy -class MuZero_Without_Replay_Buffer: +class GamePlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. """ - Main class to manage MuZero. - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. + def __init__(self, model, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) - config (dict, MuZeroConfig, optional): Override the default config of the game. + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + # self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(initial_checkpoint["weights"]) + self.model = model + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + game_id = None + + if render: + self.game.render() + + game_id = self.game.to_play() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) - split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + observation, reward, done = self.game.step(action) # 运行游戏 - Example: - >>> muzero = MuZero_Without_Replay_Buffer("cartpole") - >>> muzero.train() - >>> muzero.test(render=True) - """ + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() - def __init__(self, game_name, config=None, split_resources_in=1): - # Load the game and the config from the module with the game name - try: - game_module = importlib.import_module("games." + game_name) - print("games." + game_name) - self.Game = game_module.Game - self.config = game_module.MuZeroConfig() - except ModuleNotFoundError as err: - print( - f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' - ) - raise err - - # Overwrite the config - if config: - if type(config) is dict: - for param, value in config.items(): - if hasattr(self.config, param): - setattr(self.config, param, value) - else: - raise AttributeError( - f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." - ) - else: - self.config = config + game_history.store_search_statistics(root, self.config.action_space) - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) - # Manage GPUs - if self.config.max_num_gpus == 0 and ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - raise ValueError( - "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + return game_id, game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. 
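# Editor's sketch (not part of the patch): get_stacked_observations(-1, ...) above builds the
# network input from the most recent observation plus the previous `stacked_observations`
# frames; the project's helper also encodes past actions as extra planes, which this
# simplified, hypothetical version omits.
import numpy

def stack_last(observations, num_stacked):
    # observations: list of arrays shaped (channels, height, width); newest is last
    frames = observations[-(num_stacked + 1):]
    # pad with zero frames if the game just started
    pad = [numpy.zeros_like(observations[0])] * (num_stacked + 1 - len(frames))
    return numpy.concatenate(pad + frames, axis=0)

obs_history = [numpy.ones((3, 3, 3)) * t for t in range(2)]
print(stack_last(obs_history, num_stacked=2).shape)  # (9, 3, 3)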
+ """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, ) - if ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - total_gpus = ( - self.config.max_num_gpus - if self.config.max_num_gpus is not None - else torch.cuda.device_count() + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None else: - total_gpus = 0 - self.num_gpus = total_gpus / split_resources_in - if 1 < self.num_gpus: - self.num_gpus = math.floor(self.num_gpus) - - ray.init(num_gpus=total_gpus, ignore_reinit_error=True) - - # Checkpoint and replay buffer used to initialize workers - self.checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - self.replay_buffer = {} - - cpu_actor = CPUActor.remote() - cpu_weights = cpu_actor.get_initial_weights.remote(self.config) - self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) - - # Workers - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None - - def train(self, log_in_tensorboard=True): - """ - Spawn ray workers and launch the training. + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) - Args: - log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. 
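# Editor's sketch (not part of the patch): the comments above describe the c**(1/t) rule
# that select_action implements just below. Worked example with hypothetical visit counts:
# the most-visited child is chosen ~60% of the time at t=1, almost always as t -> 0, and
# all children approach equal probability as t grows.
import numpy

visit_counts = numpy.array([6, 3, 1], dtype="float64")
for temperature in (1.0, 0.25, 4.0):
    distribution = visit_counts ** (1 / temperature)
    distribution = distribution / distribution.sum()
    print(temperature, numpy.round(distribution, 3))
# 1.0  -> [0.6    0.3    0.1  ]
# 0.25 -> [~0.94  ~0.059 ~0.001]   (sharp, close to argmax)
# 4.0  -> [~0.403 ~0.339 ~0.258]   (flat, close to uniform)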
""" - if log_in_tensorboard or self.config.save_model: - self.config.results_path.mkdir(parents=True, exist_ok=True) - - # Manage GPUs - if 0 < self.num_gpus: - num_gpus_per_worker = self.num_gpus / ( - self.config.train_on_gpu - + self.config.num_workers * self.config.selfplay_on_gpu - + log_in_tensorboard * self.config.selfplay_on_gpu - + self.config.use_last_model_value * self.config.reanalyse_on_gpu - ) - if 1 < num_gpus_per_worker: - num_gpus_per_worker = math.floor(num_gpus_per_worker) + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) else: - num_gpus_per_worker = 0 + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) - # Initialize workers - self.training_worker = trainer.Trainer.options( - num_cpus=0, - num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, - ).remote(self.checkpoint, self.config) + return action - self.shared_storage_worker = shared_storage.SharedStorage.remote( - self.checkpoint, - self.config, - ) - self.shared_storage_worker.set_info.remote("terminate", False) +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ - self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( - self.checkpoint, self.replay_buffer, self.config + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) - #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - if self.config.use_last_model_value: - self.reanalyse_worker = replay_buffer.Reanalyse.options( - num_cpus=0, - num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, - ).remote(self.checkpoint, self.config) - - self.self_play_workers = [ - self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, - ).remote( - self.checkpoint, - self.Game, - self.config, - self.config.seed + seed, - ) - for seed in range(self.config.num_workers) - ] + # Fix random generator seed + numpy.random.seed(self.config.seed) - # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], 
[], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) - # Launch workers - # 此处调用worker进行self play,把结果存在replay_buffer里 - [ - self_play_worker.continuous_self_play.remote( - self.shared_storage_worker, self.replay_buffer_worker + values, rewards, policies, actions = self.make_target( + game_history, game_pos ) - for self_play_worker in self.self_play_workers - ] - # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 - self.training_worker.continuous_update_weights.remote( - self.replay_buffer_worker, self.shared_storage_worker - ) - # 使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - if self.config.use_last_model_value: - self.reanalyse_worker.reanalyse.remote( - self.replay_buffer_worker, self.shared_storage_worker + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) ) - - if log_in_tensorboard: - self.logging_loop( - num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) ) - def logging_loop(self, num_gpus): - """ - Keep track of the training performance. - """ - # Launch the test worker to get performance metrics - self.test_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote( - self.checkpoint, - self.Game, - self.config, - self.config.seed + self.config.num_workers, - ) - self.test_worker.continuous_self_play.remote( - self.shared_storage_worker, None, True - ) - - # Write everything in TensorBoard - writer = SummaryWriter(self.config.results_path) - - print( - "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), ) - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in self.config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # Save model representation - writer.add_text( - "Model summary", - self.summary, - ) - # Loop for updating the training performance - counter = 0 - keys = [ - "total_reward", - "muzero_reward", - "opponent_reward", - "episode_length", - "mean_value", - "training_step", - "lr", - "total_loss", - "value_loss", - "reward_loss", - "policy_loss", - "num_played_games", - "num_played_steps", - "num_reanalysed_games", - ] - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - try: - while info["training_step"] < self.config.training_steps: - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - writer.add_scalar( - "1.Total_reward/1.Total_reward", - info["total_reward"], - counter, - ) - 
writer.add_scalar( - "1.Total_reward/2.Mean_value", - info["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - info["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - info["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - info["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - info["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", info["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", info["num_played_steps"], counter - ) - writer.add_scalar( - "2.Workers/4.Reanalysed_games", - info["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - info["training_step"] / max(1, info["num_played_steps"]), - counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", info["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) - print( - f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. Loss: {info["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - time.sleep(0.5) - except KeyboardInterrupt: - pass - - self.terminate_workers() - - if self.config.save_model: - # Persist replay buffer to disk - path = self.config.results_path / "replay_buffer.pkl" - print(f"\n\nPersisting replay buffer games to disk at {path}") - pickle.dump( - { - "buffer": self.replay_buffer, - "num_played_games": self.checkpoint["num_played_games"], - "num_played_steps": self.checkpoint["num_played_steps"], - "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], - }, - open(path, "wb"), - ) - - def terminate_workers(self): + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 """ - Softly terminate the running tasks and garbage collect the workers. + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. """ - if self.shared_storage_worker: - self.shared_storage_worker.set_info.remote("terminate", True) - self.checkpoint = ray.get( - self.shared_storage_worker.get_checkpoint.remote() - ) - if self.replay_buffer_worker: - self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + game_prob = None - print("\nShutting down workers...") + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None + return game_id, self.buffer[game_id], game_prob - def test( - self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 - ): + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): """ - Test the model in a dedicated thread. 
+ Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None - Args: - render (bool): To display or not the environment. Defaults to True. + position_index = numpy.random.choice(len(game_history.root_values)) - opponent (str): "self" for self-play, "human" for playing against MuZero and "random" - for a random agent, None will use the opponent in the config. Defaults to None. + return position_index, position_prob - muzero_player (int): Player number of MuZero in case of multiplayer - games, None let MuZero play all players turn by turn, None will use muzero_player in - the config. Defaults to None. + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history - num_tests (int): Number of games to average. Defaults to 1. + self.buffer[game_id] = game_history - num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. - """ - opponent = opponent if opponent else self.config.opponent - muzero_player = muzero_player if muzero_player else self.config.muzero_player - self_play_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) - results = [] - for i in range(num_tests): - print(f"Testing {i+1}/{num_tests}") - results.append( - ray.get( - self_play_worker.play_game.remote( - 0, - 0, - render, - opponent, - muzero_player, - ) - ) + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] ) - self_play_worker.close_game.remote() - if len(self.config.players) == 1: - result = numpy.mean([sum(history.reward_history) for history in results]) + value = last_step_value * self.config.discount**self.config.td_steps else: - result = numpy.mean( - [ - sum( - reward - for i, reward in enumerate(history.reward_history) - if history.to_play_history[i - 1] == muzero_player - ) - for history in results - ] - ) - return result + value = 0 - def load_model(self, checkpoint_path=None, replay_buffer_path=None): - """ - Load a model and/or a saved replay buffer. + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i - Args: - checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ return value - replay_buffer_path (str): Path to replay_buffer.pkl + def make_target(self, game_history, state_index): """ - # Load checkpoint - if checkpoint_path: - checkpoint_path = pathlib.Path(checkpoint_path) - self.checkpoint = torch.load(checkpoint_path) - print(f"\nUsing checkpoint from {checkpoint_path}") - - # Load replay buffer - if replay_buffer_path: - replay_buffer_path = pathlib.Path(replay_buffer_path) - with open(replay_buffer_path, "rb") as f: - replay_buffer_infos = pickle.load(f) - self.replay_buffer = replay_buffer_infos["buffer"] - self.checkpoint["num_played_steps"] = replay_buffer_infos[ - "num_played_steps" - ] - self.checkpoint["num_played_games"] = replay_buffer_infos[ - "num_played_games" - ] - self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ - "num_reanalysed_games" - ] - - print(f"\nInitializing replay buffer with {replay_buffer_path}") - else: - print(f"Using empty buffer.") - self.replay_buffer = {} - self.checkpoint["training_step"] = 0 - self.checkpoint["num_played_steps"] = 0 - self.checkpoint["num_played_games"] = 0 - self.checkpoint["num_reanalysed_games"] = 0 - - def diagnose_model(self, horizon): + Generate targets for every unroll steps. """ - Play a game only with the learned model then play the same trajectory in the real - environment and display information. + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) - Args: - horizon (int): Number of timesteps for which we collect information. - """ - game = self.Game(self.config.seed) - obs = game.reset() - dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) - dm.compare_virtual_with_real_trajectories(obs, game, horizon) - input("Press enter to close all plots") - dm.close_all() - - -@ray.remote(num_cpus=0, num_gpus=0) -class CPUActor: - # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU - def __init__(self): - pass + return target_values, target_rewards, target_policies, actions - def get_initial_weights(self, config): - model = models.MuZeroNetwork(config) - weigths = model.get_weights() - summary = str(model).replace("\n", " \n\n") - return weigths, summary +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. 
+ """ + def __init__(self, initial_checkpoint, config): + self.config = config -def hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, num_tests -): - """ - Search for hyperparameters by launching parallel experiments. + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() - parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + self.training_step = initial_checkpoint["training_step"] - budget (int): Number of experiments to launch in total. + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") - parallel_experiments (int): Number of experiments to launch in parallel. + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) - num_tests (int): Number of games to average for evaluating an experiment. - """ - optimizer = nevergrad.optimizers.OnePlusOne( - parametrization=parametrization, budget=budget - ) + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ - running_experiments = [] - best_training = None - try: - # Launch initial experiments - for i in range(parallel_experiments): - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments.append(muzero) - budget -= 1 - - while 0 < budget or any(running_experiments): - for i, experiment in enumerate(running_experiments): - if experiment and experiment.config.training_steps <= ray.get( - experiment.shared_storage_worker.get_info.remote("training_step") - ): - experiment.terminate_workers() - result = experiment.test(False, num_tests=num_tests) - if not best_training or best_training["result"] < result: - best_training = { - "result": result, - "config": experiment.config, - "checkpoint": experiment.checkpoint, - } - print(f"Parameters: {experiment.param.value}") - print(f"Result: {result}") - optimizer.tell(experiment.param, -result) - - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments[i] = muzero - budget -= 1 - else: - running_experiments[i] = None + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 - except KeyboardInterrupt: - for experiment in running_experiments: - if isinstance(experiment, MuZero_Without_Replay_Buffer): - experiment.terminate_workers() - - recommendation = optimizer.provide_recommendation() - print("Best hyperparameters:") - print(recommendation.value) - if best_training: - # Save best training weights (but it's not the recommended weights) - best_training["config"].results_path.mkdir(parents=True, exist_ok=True) - torch.save( - best_training["checkpoint"], - best_training["config"].results_path / "model.checkpoint", + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in 
range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], ) - # Save the recommended hyperparameters - text_file = open( - best_training["config"].results_path / "best_parameters.txt", - "w", + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha ) - text_file.write(str(recommendation.value)) - text_file.close() - return recommendation.value + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) -def load_model_menu(muzero, game_name): - # Configure running options - options = ["Specify paths manually"] + sorted( - (pathlib.Path("results") / game_name).glob("*/") - ) - options.reverse() - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose a model to load: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - - if choice == (len(options) - 1): - # manual path option - checkpoint_path = input( - "Enter a path to the model.checkpoint, or ENTER if none: " + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), ) - while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): - checkpoint_path = input("Invalid checkpoint path. Try again: ") - replay_buffer_path = input( - "Enter a path to the replay_buffer.pkl, or ENTER if none: " + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps ) - while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): - replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") - else: - checkpoint_path = options[choice] / "model.checkpoint" - replay_buffer_path = options[choice] / "replay_buffer.pkl" - - muzero.load_model( - checkpoint_path=checkpoint_path, - replay_buffer_path=replay_buffer_path, + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss + + +def logging_loop(config, checkpoint, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) + print( + f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. 
Loss: {checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass -if __name__ == "__main__": - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZero_Without_Replay_Buffer(sys.argv[1]) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZero_Without_Replay_Buffer(sys.argv[1], config) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZero_Without_Replay_Buffer(game_name) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. {options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + +def update_gameplay_checkpoint(config, checkpoint, game_history): + checkpoint["episode_length"] = len(game_history.action_history) - 1 + checkpoint["total_reward"] = 
sum(game_history.reward_history) + checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(config.players): + checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != config.muzero_player ) - muzero = MuZero_Without_Replay_Buffer(game_name, best_hyperparameters) - else: - break - print("\nDone") - ray.shutdown() +def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 + if not path: + path = config.results_path / "model.checkpoint" + + torch.save(checkpoint, path) + +def train(log_in_tensorboard=True): + config = MuZeroConfig() + config.results_path /= "muzero_without_rb" + + if log_in_tensorboard or config.save_model: + config.results_path.mkdir(parents=True, exist_ok=True) + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(checkpoint, config) + selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(config.training_steps/step) + + writer = SummaryWriter(config.results_path) + + for episode in range(max_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + update_gameplay_checkpoint(config, checkpoint, game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % config.checkpoint_interval == 0: + checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if config.save_model: + save_checkpoint(config, checkpoint) + checkpoint["training_step"] = training_step + checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + checkpoint["total_loss"] = total_loss + checkpoint["value_loss"] = value_loss + checkpoint["reward_loss"] = reward_loss + checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + logging_loop(config, checkpoint, writer, training_step) + + + writer.close() + + selfplay.close_game() + +if __name__ == "__main__": + start_time = time.time() + train() + end_time = time.time() + print("耗时: 
{:.2f}秒".format(end_time - start_time)) \ No newline at end of file diff --git a/muzero_without_replay_buffer2.py b/muzero_without_replay_buffer2.py new file mode 100644 index 00000000..ebbb147f --- /dev/null +++ b/muzero_without_replay_buffer2.py @@ -0,0 +1,417 @@ +import pathlib +import importlib +import ray + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter +import pickle + +import math +import time +import copy +import nevergrad +import sys +import json + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer +from muzero import load_model_menu, hyperparameter_search + +import models + + +class CPUActorWithClass: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config, model_cls): + model = model_cls(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + +class MuZeroWithoutRB: + def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + if save_path_ex: + config.results_path /= save_path_ex + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + self.model_cls = model_cls + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActorWithClass() + cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) + self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) + + + def logging_loop(self, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + self.checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + self.checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + self.checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + self.checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + self.checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + self.checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + self.checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) + print( + f'Last test reward: {self.checkpoint["total_reward"]:.2f}. 
Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + + def update_gameplay_checkpoint(self, game_history): + self.checkpoint["episode_length"] = len(game_history.action_history) - 1 + self.checkpoint["total_reward"] = sum(game_history.reward_history) + self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(self.config.players): + self.checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ) + self.checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ) + + def save_checkpoint(self, path=None): #将模型存储在文件中 + if not path: + path = self.config.results_path / "model.checkpoint" + + torch.save(self.checkpoint, path) + + def train(self, log_in_tensorboard=True): + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + + trainer = Trainer(models.MuZeroNetwork, self.checkpoint, self.config) + game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) + buffer = {} + play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(self.config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(self.config.results_path) + + for episode in range(max_steps): + game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + self.update_gameplay_checkpoint( game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % self.config.checkpoint_interval == 0: + self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if self.config.save_model: + self.save_checkpoint() + self.checkpoint["training_step"] = training_step + self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + self.checkpoint["total_loss"] = total_loss + self.checkpoint["value_loss"] = value_loss + self.checkpoint["reward_loss"] = 
reward_loss + self.checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + self.logging_loop(writer, training_step) + + + writer.close() + + game_play.close_game() + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = models.MuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroWithoutRB(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroWithoutRB(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroWithoutRB(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. 
{options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroWithoutRB(game_name, best_hyperparameters , model_cls=model_cls) + else: + break + print("\nDone") diff --git a/muzero_without_replay_buffer_tictactoe.py b/muzero_without_replay_buffer_tictactoe.py new file mode 100644 index 00000000..f64413ab --- /dev/null +++ b/muzero_without_replay_buffer_tictactoe.py @@ -0,0 +1,242 @@ +from self_play import MCTS, GameHistory +from games.tictactoe import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter +import pickle + +import math +import time +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer + +def logging_loop(config, checkpoint, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + 
"1.Total_reward/5.Opponent_reward", + checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) + print( + f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. Loss: {checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + +def update_gameplay_checkpoint(config, checkpoint, game_history): + checkpoint["episode_length"] = len(game_history.action_history) - 1 + checkpoint["total_reward"] = sum(game_history.reward_history) + checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(config.players): + checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != config.muzero_player + ) + +def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 + if not path: + path = config.results_path / "model.checkpoint" + + torch.save(checkpoint, path) + +def train(log_in_tensorboard=True): + config = MuZeroConfig() + config.results_path /= "muzero_without_rb" + + if log_in_tensorboard or config.save_model: + config.results_path.mkdir(parents=True, exist_ok=True) + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(models.MuZeroNetwork, checkpoint, config) + selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(config.training_steps/step) + # 
max_steps = 2000 + + writer = SummaryWriter(config.results_path) + + for episode in range(max_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + update_gameplay_checkpoint(config, checkpoint, game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % config.checkpoint_interval == 0: + checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if config.save_model: + save_checkpoint(config, checkpoint) + checkpoint["training_step"] = training_step + checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + checkpoint["total_loss"] = total_loss + checkpoint["value_loss"] = value_loss + checkpoint["reward_loss"] = reward_loss + checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + logging_loop(config, checkpoint, writer, training_step) + + + writer.close() + + selfplay.close_game() + +if __name__ == "__main__": + start_time = time.time() + train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) \ No newline at end of file diff --git a/simplifiedMuZero/net2/trainer_2net.py b/simplifiedMuZero/net2/trainer_2net.py index 19888cf2..567b8f9a 100644 --- a/simplifiedMuZero/net2/trainer_2net.py +++ b/simplifiedMuZero/net2/trainer_2net.py @@ -69,8 +69,6 @@ def continuous_update_weights(self, replay_buffer, shared_storage): shared_storage.get_info.remote("terminate") ): index_batch, batch = ray.get(next_batch) - print("train batch size is : ", batch[0].shape) - print("train index_batch size is : ", index_batch.shape) next_batch = replay_buffer.get_batch.remote() self.update_lr() ( diff --git a/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py b/simplifiedMuZero/search_policy/self_play_uniform_search.py similarity index 91% rename from simplifiedMuZero/without_rb/self_play_without_replay_buffer.py rename to simplifiedMuZero/search_policy/self_play_uniform_search.py index 7e0d6512..314249f0 100644 --- a/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py +++ b/simplifiedMuZero/search_policy/self_play_uniform_search.py @@ -2,13 +2,13 @@ import time import numpy -# import ray +import ray import torch -import simplifiedMuZero.without_rb.models_without_replay_buffer as models +import models -# @ray.remote +@ray.remote class SelfPlay: """ Class which run in a dedicated thread to play games and save them to the replay-buffer. @@ -107,9 +107,6 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): self.close_game() - # play game 与continuous self play 的区别: - # 1. play game 是实际运行游戏,游戏的结果存在game history里,不向replay buffer里写 - # 2. 
continuous self play 调用play game,把获取到的game history 异步写进 replay buffer #play game 运行 # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 # 运行步骤: @@ -131,7 +128,7 @@ def play_game( game_history.action_history.append(0) game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 done = False @@ -157,7 +154,7 @@ def play_game( # 一下的if-else部分主要是为了选择一个动作 # Choose the action if opponent == "self" or muzero_player == self.game.to_play(): - root, mcts_info = MCTS(self.config).run( + root, mcts_info = UniformSearch(self.config).run( self.model, stacked_observations, self.game.legal_actions(), @@ -206,7 +203,7 @@ def select_opponent_action(self, opponent, stacked_observations): Select opponent action for evaluating MuZero level. """ if opponent == "human": - root, mcts_info = MCTS(self.config).run( + root, mcts_info = UniformSearch(self.config).run( self.model, stacked_observations, self.game.legal_actions(), @@ -267,7 +264,7 @@ def select_action(node, temperature): # Game independent -class MCTS: +class UniformSearch: """ Core Monte Carlo Tree Search algorithm. To decide on an action, we run N simulations, always starting at the root of @@ -411,46 +408,47 @@ def select_child(self, node, min_max_stats): """ Select the child with the highest UCB score. """ - max_ucb = max( - self.ucb_score(node, child, min_max_stats) - for action, child in node.children.items() - ) - action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) - [ - action - for action, child in node.children.items() - if self.ucb_score(node, child, min_max_stats) == max_ucb - ] - ) + # max_ucb = max( + # self.ucb_score(node, child, min_max_stats) + # for action, child in node.children.items() + # ) + # action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + # [ + # action + # for action, child in node.children.items() + # if self.ucb_score(node, child, min_max_stats) == max_ucb + # ] + # ) + action = numpy.random.choice([action for action,child in node.children.items()]) return action, node.children[action] - def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 - """ - The score for a node is based on its value, plus an exploration bonus based on the prior. - """ - pb_c = ( - math.log( - (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 - ) - + self.config.pb_c_init - ) - pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) - - prior_score = pb_c * child.prior # prior 之前的p_value - # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) - # prior_score = pbc * prior - - if child.visit_count > 0: - # Mean value Q - value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 - child.reward - + self.config.discount # 衰减系数, 之后乘以子节点的值 - * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 - ) - else: - value_score = 0 - - return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + # def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + # """ + # The score for a node is based on its value, plus an exploration bonus based on the prior. 
+    #     """
+    #     pb_c = (
+    #         math.log(
+    #             (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base  # pb_c_base is set in the config
+    #         )
+    #         + self.config.pb_c_init
+    #     )
+    #     pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)
+    #
+    #     prior_score = pb_c * child.prior  # prior is the policy probability of this child
+    #     # formula: pb_c = (log((N+C+1)/C) + init) * sqrt(N/(VC+1))
+    #     # prior_score = pb_c * prior
+    #
+    #     if child.visit_count > 0:
+    #         # Mean value Q
+    #         value_score = min_max_stats.normalize(  # this expression is the Q value, Q = E[r + discount * Q']; it is normalized here
+    #             child.reward
+    #             + self.config.discount  # discount factor, applied to the child value
+    #             * (child.value() if len(self.config.players) == 1 else -child.value())  # with more than one player the child belongs to the opponent, so its value is negated
+    #         )
+    #     else:
+    #         value_score = 0
+    #
+    #     return prior_score + value_score  # the prior score plus the value score gives the UCB score
 
     # Backpropagation
     # Add 1 to the visit count of every node on the path and accumulate the value/reward
diff --git a/simplifiedMuZero/without_rb/game_play.py b/simplifiedMuZero/without_rb/game_play.py
new file mode 100644
index 00000000..b0304d64
--- /dev/null
+++ b/simplifiedMuZero/without_rb/game_play.py
@@ -0,0 +1,182 @@
+import numpy
+import torch
+from self_play import GameHistory, MCTS
+class GamePlay:
+    """
+    Class used to play full games with the current model and return the resulting
+    game history (this variant runs in the main process and does not use a replay buffer).
+    """
+
+    def __init__(self, model, initial_checkpoint, Game, config, seed):
+        self.config = config
+        self.game = Game(seed)
+
+        # Fix random generator seed
+        numpy.random.seed(seed)
+        torch.manual_seed(seed)
+
+        # Initialize the network
+        # self.model = models.MuZeroNetwork(self.config)
+        # self.model.set_weights(initial_checkpoint["weights"])
+        self.model = model
+        self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu"))
+        self.model.eval()
+        self.trained_steps = initial_checkpoint["training_step"]
+        self.terminate = False
+
+    # How play_game runs:
+    # The legal actions are fixed and provided by the game file (legal_actions is called
+    # here without inspecting env, i.e. the action set does not depend on changes in the
+    # game environment).
+    # Steps:
+    # 1. Create a GameHistory to store the trajectory
+    # 2. Check whether the game is over or the maximum number of moves has been reached
+    # 3. Build the stacked observations (some games need the previous history and move trajectory)
+    # 4. Run MCTS to search for the next action
+    # 5. Call the game's step(action) to get the next observation, reward and done flag
+    # 6. Repeat steps 2-5 until the game ends
+    # 7. Return the GameHistory
+    def play_game(
+        self, temperature, temperature_threshold, render, opponent, muzero_player
+    ):
+        """
+        Play one game with actions based on the Monte Carlo tree search at each move.
+        """
+        game_history = GameHistory()
+        observation = self.game.reset()
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # append the observation returned by reset()
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+        game_id = None
+
+        if render:
+            self.game.render()
+
+        game_id = self.game.to_play()
+
+        with torch.no_grad():
+            while (
+                not done and len(game_history.action_history) <= self.config.max_moves
+            ):  # while the game is not over and the number of moves is below max_moves
+                assert (
+                    len(numpy.array(observation).shape) == 3
+                ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+                assert (
+                    numpy.array(observation).shape == self.config.observation_shape
+                ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
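+                # Build the network input: the latest observation stacked with the
+                # previous `stacked_observations` frames and the corresponding action
+                # planes (see GameHistory.get_stacked_observations).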
+                stacked_observations = game_history.get_stacked_observations(
+                    -1, self.config.stacked_observations, len(self.config.action_space)
+                )
+                # The index is -1: game_history already holds the observation appended at
+                # reset, so taking the index modulo the history length points at the most
+                # recent observation.
+                # config.stacked_observations is the number of past observations to keep;
+                # set it to 0 if no history is needed.
+
+                # Choose the action (the if/else below picks it either with MCTS or from the opponent)
+                if opponent == "self" or muzero_player == self.game.to_play():
+                    root, mcts_info = MCTS(self.config).run(
+                        self.model,
+                        stacked_observations,
+                        self.game.legal_actions(),
+                        self.game.to_play(),  # to_play returns the id of the player whose turn it is (0 by default)
+                        True,
+                    )
+                    action = self.select_action(
+                        root,
+                        temperature
+                        if not temperature_threshold
+                        or len(game_history.action_history) < temperature_threshold
+                        else 0,
+                    )  # select the action according to the temperature
+
+                    if render:
+                        print(f'Tree depth: {mcts_info["max_tree_depth"]}')
+                        print(
+                            f"Root value for player {self.game.to_play()}: {root.value():.2f}"
+                        )
+                else:
+                    action, root = self.select_opponent_action(  # opponent action: "random", "human" or "expert"
+                        opponent, stacked_observations
+                    )
+
+                observation, reward, done = self.game.step(action)  # advance the environment one step
+
+                if render:
+                    print(f"Played action: {self.game.action_to_string(action)}")
+                    self.game.render()
+
+                game_history.store_search_statistics(root, self.config.action_space)
+
+                # Next batch
+                game_history.action_history.append(action)
+                game_history.observation_history.append(observation)  # appended to the observation history; get_stacked_observations later reads it back from the end
+                game_history.reward_history.append(reward)
+                game_history.to_play_history.append(self.game.to_play())
+
+        return game_id, game_history
+
+    def close_game(self):
+        self.game.close()
+
+    def select_opponent_action(self, opponent, stacked_observations):
+        """
+        Select opponent action for evaluating MuZero level.
+        """
+        if opponent == "human":
+            root, mcts_info = MCTS(self.config).run(
+                self.model,
+                stacked_observations,
+                self.game.legal_actions(),
+                self.game.to_play(),
+                True,
+            )
+            print(f'Tree depth: {mcts_info["max_tree_depth"]}')
+            print(f"Root value for player {self.game.to_play()}: {root.value():.2f}")
+            print(
+                f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}"
+            )
+            return self.game.human_to_action(), root
+        elif opponent == "expert":
+            return self.game.expert_agent(), None
+        elif opponent == "random":
+            assert (
+                self.game.legal_actions()
+            ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}."
+            assert set(self.game.legal_actions()).issubset(
+                set(self.config.action_space)
+            ), "Legal actions should be a subset of the action space."
+
+            return numpy.random.choice(self.game.legal_actions()), None
+        else:
+            raise NotImplementedError(
+                'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"'
+            )
+
+    # Select an action according to the visit count distribution and the temperature.
+    # The temperature is changed dynamically via the visit_softmax_temperature function in the config.
+    # Each action is weighted by c^(1/t), where c is its visit count, so:
+    # the smaller t is, the larger 1/t becomes and the more likely the most visited action is chosen;
+    # the larger t is, 1/t -> 0 and c^0 = 1, so all counts become indistinguishable and the choice is close to uniform.
+    # As special cases, t = 0 picks the most visited action (argmax) and t = +inf picks uniformly at random.
+    @staticmethod  # static method decorator, similar to the `static` keyword in other languages
+    def select_action(node, temperature):
+        """
+        Select action according to the visit count distribution and the temperature.
+        The temperature is changed dynamically with the visit_softmax_temperature function
+        in the config.
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action \ No newline at end of file diff --git a/simplifiedMuZero/without_rb/models_without_replay_buffer.py b/simplifiedMuZero/without_rb/models_without_replay_buffer.py deleted file mode 100644 index d4b8bc2f..00000000 --- a/simplifiedMuZero/without_rb/models_without_replay_buffer.py +++ /dev/null @@ -1,696 +0,0 @@ -import math -from abc import ABC, abstractmethod - -import torch - - -class MuZeroNetwork: - def __new__(cls, config): - if config.network == "fullyconnected": - return MuZeroFullyConnectedNetwork( - config.observation_shape, - config.stacked_observations, - len(config.action_space), - config.encoding_size, - config.fc_reward_layers, - config.fc_value_layers, - config.fc_policy_layers, - config.fc_representation_layers, - config.fc_dynamics_layers, - config.support_size, - ) - elif config.network == "resnet": - return MuZeroResidualNetwork( - config.observation_shape, - config.stacked_observations, - len(config.action_space), - config.blocks, - config.channels, - config.reduced_channels_reward, - config.reduced_channels_value, - config.reduced_channels_policy, - config.resnet_fc_reward_layers, - config.resnet_fc_value_layers, - config.resnet_fc_policy_layers, - config.support_size, - config.downsample, - ) - else: - raise NotImplementedError( - 'The network parameter should be "fullyconnected" or "resnet".' 
- ) - - -def dict_to_cpu(dictionary): - cpu_dict = {} - for key, value in dictionary.items(): - if isinstance(value, torch.Tensor): - cpu_dict[key] = value.cpu() - elif isinstance(value, dict): - cpu_dict[key] = dict_to_cpu(value) - else: - cpu_dict[key] = value - return cpu_dict - - -class AbstractNetwork(ABC, torch.nn.Module): - def __init__(self): - super().__init__() - pass - - @abstractmethod - def initial_inference(self, observation): - pass - - @abstractmethod - def recurrent_inference(self, encoded_state, action): - pass - - def get_weights(self): - return dict_to_cpu(self.state_dict()) - - def set_weights(self, weights): - self.load_state_dict(weights) - - -################################## -######## Fully Connected ######### - - -class MuZeroFullyConnectedNetwork(AbstractNetwork): - def __init__( - self, - observation_shape, - stacked_observations, - action_space_size, - encoding_size, - fc_reward_layers, - fc_value_layers, - fc_policy_layers, - fc_representation_layers, - fc_dynamics_layers, - support_size, - ): - super().__init__() - self.action_space_size = action_space_size - self.full_support_size = 2 * support_size + 1 - # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 - - self.representation_network = torch.nn.DataParallel( - mlp( - observation_shape[0] - * observation_shape[1] - * observation_shape[2] - * (stacked_observations + 1) - + stacked_observations * observation_shape[1] * observation_shape[2], - fc_representation_layers, - encoding_size, - ) - ) - - #dynamics的输入是encoding_size+action_space_size - self.dynamics_encoded_state_network = torch.nn.DataParallel( - mlp( - encoding_size + self.action_space_size, - fc_dynamics_layers, - encoding_size, - ) - ) - self.dynamics_reward_network = torch.nn.DataParallel( - mlp(encoding_size, fc_reward_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] - ) - - self.prediction_policy_network = torch.nn.DataParallel( - mlp(encoding_size, fc_policy_layers, self.action_space_size) #输出action的概率 - ) - self.prediction_value_network = torch.nn.DataParallel( - mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] - ) - - def prediction(self, encoded_state): - policy_logits = self.prediction_policy_network(encoded_state) - value = self.prediction_value_network(encoded_state) - return policy_logits, value - - def representation(self, observation): - encoded_state = self.representation_network( - observation.view(observation.shape[0], -1) - ) - - # 正则化 - # Scale encoded state between [0, 1] (See appendix paper Training) - min_encoded_state = encoded_state.min(1, keepdim=True)[0] - max_encoded_state = encoded_state.max(1, keepdim=True)[0] - scale_encoded_state = max_encoded_state - min_encoded_state - scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN - encoded_state_normalized = ( - encoded_state - min_encoded_state - ) / scale_encoded_state - return encoded_state_normalized - - # dynamic同representation的唯一不同就是前者需要将encoded_state和action合并在一起作为输入,而representation不需要绑定action - def dynamics(self, encoded_state, action): - # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) - action_one_hot = ( - torch.zeros((action.shape[0], self.action_space_size)) - .to(action.device) - .float() - ) - action_one_hot.scatter_(1, action.long(), 1.0) #将action的位置赋值为1 - x = torch.cat((encoded_state, action_one_hot), dim=1) - - next_encoded_state = 
self.dynamics_encoded_state_network(x) - - reward = self.dynamics_reward_network(next_encoded_state) - - # 正则化 - # Scale encoded state between [0, 1] (See paper appendix Training) - min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0] - max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0] - scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state - scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN - next_encoded_state_normalized = ( - next_encoded_state - min_next_encoded_state - ) / scale_next_encoded_state - - return next_encoded_state_normalized, reward - - def initial_inference(self, observation): - encoded_state = self.representation(observation) - policy_logits, value = self.prediction(encoded_state) - # reward equal to 0 for consistency 一致性奖励等于 0 - reward = torch.log( - ( - torch.zeros(1, self.full_support_size) - .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) - .repeat(len(observation), 1) - .to(observation.device) - ) - ) - # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 - - return ( - value, - reward, - policy_logits, - encoded_state, - ) - - def recurrent_inference(self, encoded_state, action): - next_encoded_state, reward = self.dynamics(encoded_state, action) - policy_logits, value = self.prediction(next_encoded_state) - return value, reward, policy_logits, next_encoded_state - - -###### End Fully Connected ####### -################################## - - -################################## -############# ResNet ############# - - -def conv3x3(in_channels, out_channels, stride=1): - return torch.nn.Conv2d( - in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False - ) - - -# Residual block -class ResidualBlock(torch.nn.Module): - def __init__(self, num_channels, stride=1): - super().__init__() - self.conv1 = conv3x3(num_channels, num_channels, stride) - self.bn1 = torch.nn.BatchNorm2d(num_channels) - self.conv2 = conv3x3(num_channels, num_channels) - self.bn2 = torch.nn.BatchNorm2d(num_channels) - - def forward(self, x): - out = self.conv1(x) - out = self.bn1(out) - out = torch.nn.functional.relu(out) - out = self.conv2(out) - out = self.bn2(out) - out += x - out = torch.nn.functional.relu(out) - return out - - -# Downsample observations before representation network (See paper appendix Network Architecture) -class DownSample(torch.nn.Module): - def __init__(self, in_channels, out_channels): - super().__init__() - self.conv1 = torch.nn.Conv2d( - in_channels, - out_channels // 2, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.resblocks1 = torch.nn.ModuleList( - [ResidualBlock(out_channels // 2) for _ in range(2)] - ) - self.conv2 = torch.nn.Conv2d( - out_channels // 2, - out_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.resblocks2 = torch.nn.ModuleList( - [ResidualBlock(out_channels) for _ in range(3)] - ) - self.pooling1 = torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1) - self.resblocks3 = torch.nn.ModuleList( - [ResidualBlock(out_channels) for _ in range(3)] - ) - self.pooling2 = torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1) - - def forward(self, x): - x = self.conv1(x) - for block in self.resblocks1: - x = block(x) - x = self.conv2(x) - for block in self.resblocks2: - x = block(x) - x = self.pooling1(x) - for block in self.resblocks3: - x = block(x) - x = self.pooling2(x) - return x - - -class DownsampleCNN(torch.nn.Module): - def __init__(self, in_channels, 
out_channels, h_w): - super().__init__() - mid_channels = (in_channels + out_channels) // 2 - self.features = torch.nn.Sequential( - torch.nn.Conv2d( - in_channels, mid_channels, kernel_size=h_w[0] * 2, stride=4, padding=2 - ), - torch.nn.ReLU(inplace=True), - torch.nn.MaxPool2d(kernel_size=3, stride=2), - torch.nn.Conv2d(mid_channels, out_channels, kernel_size=5, padding=2), - torch.nn.ReLU(inplace=True), - torch.nn.MaxPool2d(kernel_size=3, stride=2), - ) - self.avgpool = torch.nn.AdaptiveAvgPool2d(h_w) - - def forward(self, x): - x = self.features(x) - x = self.avgpool(x) - return x - - -class RepresentationNetwork(torch.nn.Module): - def __init__( - self, - observation_shape, - stacked_observations, - num_blocks, - num_channels, - downsample, - ): - super().__init__() - self.downsample = downsample - if self.downsample: - if self.downsample == "resnet": - self.downsample_net = DownSample( - observation_shape[0] * (stacked_observations + 1) - + stacked_observations, - num_channels, - ) - elif self.downsample == "CNN": - self.downsample_net = DownsampleCNN( - observation_shape[0] * (stacked_observations + 1) - + stacked_observations, - num_channels, - ( - math.ceil(observation_shape[1] / 16), - math.ceil(observation_shape[2] / 16), - ), - ) - else: - raise NotImplementedError('downsample should be "resnet" or "CNN".') - self.conv = conv3x3( - observation_shape[0] * (stacked_observations + 1) + stacked_observations, - num_channels, - ) - self.bn = torch.nn.BatchNorm2d(num_channels) - self.resblocks = torch.nn.ModuleList( - [ResidualBlock(num_channels) for _ in range(num_blocks)] - ) - - def forward(self, x): - if self.downsample: - x = self.downsample_net(x) - else: - x = self.conv(x) - x = self.bn(x) - x = torch.nn.functional.relu(x) - - for block in self.resblocks: - x = block(x) - return x - - -class DynamicsNetwork(torch.nn.Module): - def __init__( - self, - num_blocks, - num_channels, - reduced_channels_reward, - fc_reward_layers, - full_support_size, - block_output_size_reward, - ): - super().__init__() - self.conv = conv3x3(num_channels, num_channels - 1) - self.bn = torch.nn.BatchNorm2d(num_channels - 1) - self.resblocks = torch.nn.ModuleList( - [ResidualBlock(num_channels - 1) for _ in range(num_blocks)] - ) - - self.conv1x1_reward = torch.nn.Conv2d( - num_channels - 1, reduced_channels_reward, 1 - ) - self.block_output_size_reward = block_output_size_reward - self.fc = mlp( - self.block_output_size_reward, - fc_reward_layers, - full_support_size, - ) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = torch.nn.functional.relu(x) - for block in self.resblocks: - x = block(x) - state = x - x = self.conv1x1_reward(x) - x = x.view(-1, self.block_output_size_reward) - reward = self.fc(x) - return state, reward - - -class PredictionNetwork(torch.nn.Module): - def __init__( - self, - action_space_size, - num_blocks, - num_channels, - reduced_channels_value, - reduced_channels_policy, - fc_value_layers, - fc_policy_layers, - full_support_size, - block_output_size_value, - block_output_size_policy, - ): - super().__init__() - self.resblocks = torch.nn.ModuleList( - [ResidualBlock(num_channels) for _ in range(num_blocks)] - ) - - self.conv1x1_value = torch.nn.Conv2d(num_channels, reduced_channels_value, 1) - self.conv1x1_policy = torch.nn.Conv2d(num_channels, reduced_channels_policy, 1) - self.block_output_size_value = block_output_size_value - self.block_output_size_policy = block_output_size_policy - self.fc_value = mlp( - self.block_output_size_value, fc_value_layers, 
full_support_size - ) - self.fc_policy = mlp( - self.block_output_size_policy, - fc_policy_layers, - action_space_size, - ) - - def forward(self, x): - for block in self.resblocks: - x = block(x) - value = self.conv1x1_value(x) - policy = self.conv1x1_policy(x) - value = value.view(-1, self.block_output_size_value) - policy = policy.view(-1, self.block_output_size_policy) - value = self.fc_value(value) - policy = self.fc_policy(policy) - return policy, value - - -class MuZeroResidualNetwork(AbstractNetwork): - def __init__( - self, - observation_shape, - stacked_observations, - action_space_size, - num_blocks, - num_channels, - reduced_channels_reward, - reduced_channels_value, - reduced_channels_policy, - fc_reward_layers, - fc_value_layers, - fc_policy_layers, - support_size, - downsample, - ): - super().__init__() - self.action_space_size = action_space_size - self.full_support_size = 2 * support_size + 1 - block_output_size_reward = ( - ( - reduced_channels_reward - * math.ceil(observation_shape[1] / 16) - * math.ceil(observation_shape[2] / 16) - ) - if downsample - else (reduced_channels_reward * observation_shape[1] * observation_shape[2]) - ) - - block_output_size_value = ( - ( - reduced_channels_value - * math.ceil(observation_shape[1] / 16) - * math.ceil(observation_shape[2] / 16) - ) - if downsample - else (reduced_channels_value * observation_shape[1] * observation_shape[2]) - ) - - block_output_size_policy = ( - ( - reduced_channels_policy - * math.ceil(observation_shape[1] / 16) - * math.ceil(observation_shape[2] / 16) - ) - if downsample - else (reduced_channels_policy * observation_shape[1] * observation_shape[2]) - ) - - self.representation_network = torch.nn.DataParallel( - RepresentationNetwork( - observation_shape, - stacked_observations, - num_blocks, - num_channels, - downsample, - ) - ) - - self.dynamics_network = torch.nn.DataParallel( - DynamicsNetwork( - num_blocks, - num_channels + 1, - reduced_channels_reward, - fc_reward_layers, - self.full_support_size, - block_output_size_reward, - ) - ) - - self.prediction_network = torch.nn.DataParallel( - PredictionNetwork( - action_space_size, - num_blocks, - num_channels, - reduced_channels_value, - reduced_channels_policy, - fc_value_layers, - fc_policy_layers, - self.full_support_size, - block_output_size_value, - block_output_size_policy, - ) - ) - - def prediction(self, encoded_state): - policy, value = self.prediction_network(encoded_state) - return policy, value - - def representation(self, observation): - encoded_state = self.representation_network(observation) - - # Scale encoded state between [0, 1] (See appendix paper Training) - min_encoded_state = ( - encoded_state.view( - -1, - encoded_state.shape[1], - encoded_state.shape[2] * encoded_state.shape[3], - ) - .min(2, keepdim=True)[0] - .unsqueeze(-1) - ) - max_encoded_state = ( - encoded_state.view( - -1, - encoded_state.shape[1], - encoded_state.shape[2] * encoded_state.shape[3], - ) - .max(2, keepdim=True)[0] - .unsqueeze(-1) - ) - scale_encoded_state = max_encoded_state - min_encoded_state - scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 - encoded_state_normalized = ( - encoded_state - min_encoded_state - ) / scale_encoded_state - return encoded_state_normalized - - def dynamics(self, encoded_state, action): - # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) - action_one_hot = ( - torch.ones( - ( - encoded_state.shape[0], - 1, - encoded_state.shape[2], - encoded_state.shape[3], - ) - ) - 
.to(action.device) - .float() - ) - action_one_hot = ( - action[:, :, None, None] * action_one_hot / self.action_space_size - ) - x = torch.cat((encoded_state, action_one_hot), dim=1) - next_encoded_state, reward = self.dynamics_network(x) - - # Scale encoded state between [0, 1] (See paper appendix Training) - min_next_encoded_state = ( - next_encoded_state.view( - -1, - next_encoded_state.shape[1], - next_encoded_state.shape[2] * next_encoded_state.shape[3], - ) - .min(2, keepdim=True)[0] - .unsqueeze(-1) - ) - max_next_encoded_state = ( - next_encoded_state.view( - -1, - next_encoded_state.shape[1], - next_encoded_state.shape[2] * next_encoded_state.shape[3], - ) - .max(2, keepdim=True)[0] - .unsqueeze(-1) - ) - scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state - scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 - next_encoded_state_normalized = ( - next_encoded_state - min_next_encoded_state - ) / scale_next_encoded_state - return next_encoded_state_normalized, reward - - def initial_inference(self, observation): - encoded_state = self.representation(observation) - policy_logits, value = self.prediction(encoded_state) - # reward equal to 0 for consistency - reward = torch.log( - ( - torch.zeros(1, self.full_support_size) - .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # 将support_size位置设为1 - .repeat(len(observation), 1) # 根据observation的长度复制,保证reward的维度于observation的一致,即之前的observation也赋值 - .to(observation.device) - ) - ) - return ( - value, - reward, - policy_logits, - encoded_state, - ) - - def recurrent_inference(self, encoded_state, action): - next_encoded_state, reward = self.dynamics(encoded_state, action) - policy_logits, value = self.prediction(next_encoded_state) - return value, reward, policy_logits, next_encoded_state - - -########### End ResNet ########### -################################## - - -def mlp( - input_size, - layer_sizes, - output_size, - output_activation=torch.nn.Identity, - activation=torch.nn.ELU, -): - sizes = [input_size] + layer_sizes + [output_size] - layers = [] - for i in range(len(sizes) - 1): - act = activation if i < len(sizes) - 2 else output_activation #激活函数,最后一层是output_activation,其余的都一样 - layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()] - return torch.nn.Sequential(*layers) - - -def support_to_scalar(logits, support_size): # logits 是 value的对数值,support_size是转换后的范围。 - """ - Transform a categorical representation to a scalar - See paper appendix Network Architecture - """ - # Decode to a scalar - probabilities = torch.softmax(logits, dim=1) # softmax在指定的向量和为1,softmax扩大大的,缩小下的,shape为[stacked_size, fully_support_size] - support = ( - torch.tensor([x for x in range(-support_size, support_size + 1)]) # 范围是-support_size, support_szie。因为support_size+1 - .expand(probabilities.shape) - .float() - .to(device=probabilities.device) - ) # shape 为【stacked_size, fully_support_size】, - x = torch.sum(support * probabilities, dim=1, keepdim=True) # 输出为【1,fully_support_size】,因为dim=1,另外keep_dim=True,所有是【1,fully_support_size】而不是【fully_support_size] - - # Invert the scaling (defined in https://arxiv.org/abs/1805.11593) - x = torch.sign(x) * ( # sign函数为分段函数, x小于0为-1,大于0为1,0为0。主要是获取x的符号 - ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) # (sqrt(1+0.04*(|x|+1.001))-1)/0.002 - ** 2 - - 1 - ) - return x - - -def scalar_to_support(x, support_size): - """ - Transform a scalar to a categorical representation with (2 * support_size + 1) categories - See paper appendix Network 
Architecture - """ - # Reduce the scale (defined in https://arxiv.org/abs/1805.11593) - x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x - - # Encode on a vector - x = torch.clamp(x, -support_size, support_size) # 裁剪x的范围,使x的范围定为[-support_size, support_size] - floor = x.floor() # floor向下取整,类似的,ceil为向上取整 - prob = x - floor # 减去整数,保留小数部分(因为在support_to_scala部分是index位置乘上概率) - logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device) - logits.scatter_( - 2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1) - ) - indexes = floor + support_size + 1 - prob = prob.masked_fill_(2 * support_size < indexes, 0.0) - indexes = indexes.masked_fill_(2 * support_size < indexes, 0.0) - logits.scatter_(2, indexes.long().unsqueeze(-1), prob.unsqueeze(-1)) - return logits diff --git a/simplifiedMuZero/without_rb/play_buffer.py b/simplifiedMuZero/without_rb/play_buffer.py new file mode 100644 index 00000000..ad13a67f --- /dev/null +++ b/simplifiedMuZero/without_rb/play_buffer.py @@ -0,0 +1,214 @@ +import numpy +import torch +import copy +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + 
index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. + """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
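compute_target_value above builds the n-step value target: the discounted root value td_steps into the future plus the discounted rewards collected on the way. A single-player distillation of it, on made-up numbers (the two-player sign flips from the original are omitted to keep the example short):

```python
# Single-player sketch of compute_target_value, on made-up numbers.
def n_step_value(root_values, rewards, index, td_steps, discount):
    bootstrap = index + td_steps
    # Discounted search value td_steps into the future, if the game is long enough.
    value = (
        root_values[bootstrap] * discount**td_steps
        if bootstrap < len(root_values)
        else 0.0
    )
    # Plus the discounted rewards collected between index and the bootstrap step.
    for i, reward in enumerate(rewards[index + 1 : bootstrap + 1]):
        value += reward * discount**i
    return value

root_values = [0.5, 0.6, 0.7, 0.9, 1.0]   # search values per position
rewards     = [0.0, 0.0, 1.0, 0.0, 1.0]   # rewards recorded after each move
print(n_step_value(root_values, rewards, index=0, td_steps=3, discount=0.99))
# 0.9 * 0.99**3 + 0.0 + 1.0 * 0.99 + 0.0 ≈ 1.863
```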
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions diff --git a/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py b/simplifiedMuZero/without_rb/trainer.py similarity index 67% rename from simplifiedMuZero/without_rb/trainer_without_replay_buffer.py rename to simplifiedMuZero/without_rb/trainer.py index e2f64fa2..265b13c5 100644 --- a/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py +++ b/simplifiedMuZero/without_rb/trainer.py @@ -1,21 +1,14 @@ -import copy -import time - import numpy -# import ray import torch +import models -import simplifiedMuZero.without_rb.models_without_replay_buffer as models - - -@ray.remote class Trainer: """ Class which run in a dedicated thread to train a neural network and save it in the shared storage. """ - def __init__(self, initial_checkpoint, config): + def __init__(self, model_cls, initial_checkpoint, config): self.config = config # Fix random generator seed @@ -23,8 +16,8 @@ def __init__(self, initial_checkpoint, config): torch.manual_seed(self.config.seed) # Initialize the network - self.model = models.MuZeroNetwork(self.config) - self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) self.model.train() @@ -52,77 +45,29 @@ def __init__(self, initial_checkpoint, config): f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." ) - if initial_checkpoint["optimizer_state"] is not None: - print("Loading optimizer...\n") - self.optimizer.load_state_dict( - copy.deepcopy(initial_checkpoint["optimizer_state"]) - ) - - # update weights 与 continuous update weights 的区别 - # 1. update weights 是实际计算更新network的权重 - # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 - def continuous_update_weights(self, replay_buffer, shared_storage): - # Wait for the replay buffer to be filled - while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: - time.sleep(0.1) - - next_batch = replay_buffer.get_batch.remote() - # Training loop - while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 - ): - index_batch, batch = ray.get(next_batch) - next_batch = replay_buffer.get_batch.remote() - self.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = self.update_weights(batch) - - if self.config.PER: - # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) - replay_buffer.update_priorities.remote(priorities, index_batch) - - # Save to the shared storage - if self.training_step % self.config.checkpoint_interval == 0: - shared_storage.set_info.remote( - { - "weights": copy.deepcopy(self.model.get_weights()), - "optimizer_state": copy.deepcopy( - models.dict_to_cpu(self.optimizer.state_dict()) - ), - } - ) - if self.config.save_model: - shared_storage.save_checkpoint.remote() - shared_storage.set_info.remote( - { - "training_step": self.training_step, - "lr": self.optimizer.param_groups[0]["lr"], - "total_loss": total_loss, - "value_loss": value_loss, - "reward_loss": reward_loss, - "policy_loss": policy_loss, - } - ) - - # Managing the self-play / training ratio - if self.config.training_delay: - time.sleep(self.config.training_delay) - if self.config.ratio: - while ( - self.training_step - / max( - 1, ray.get(shared_storage.get_info.remote("num_played_steps")) - ) - > self.config.ratio - and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) - ): - time.sleep(0.5) + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) def update_weights(self, batch): """ @@ -144,8 +89,6 @@ def update_weights(self, batch): priorities = numpy.zeros_like(target_value_scalar) device = next(self.model.parameters()).device - if self.config.PER: - weight_batch = torch.tensor(weight_batch.copy()).float().to(device) observation_batch = ( torch.tensor(numpy.array(observation_batch)).float().to(device) ) @@ -254,9 +197,7 @@ def update_weights(self, batch): # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - if self.config.PER: - # Correct PER bias by using importance-sampling (IS) weights - loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) loss = loss.mean() @@ -297,7 +238,6 @@ def loss_function( # Cross-entropy seems to have a better convergence than MSE value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) - policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( - 1 - ) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + return value_loss, reward_loss, policy_loss diff --git a/test/game_play_test.py b/test/game_play_test.py index 60b6a5ec..78fdc4a5 100644 --- a/test/game_play_test.py +++ b/test/game_play_test.py @@ -675,7 +675,15 @@ def loss_function( # print(game_id) # print(game_history.action_history) - # print(game_history.reward_history) + print(game_history.reward_history) + muzero_reward = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + + print(muzero_reward) # print(game_history.to_play_history) # # print(game_history.observation_history) # print("child visits", game_history.child_visits) diff --git a/test/mcts_test.py b/test/mcts_test.py new file mode 100644 index 00000000..d3edc0f3 --- /dev/null +++ b/test/mcts_test.py @@ -0,0 +1,245 @@ +import models +from self_play import MCTS, GameHistory, Node, MinMaxStats +from games.tictactoe import MuZeroConfig, Game + +import torch +import numpy +import math + +class MCTS1: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 
并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + print(override_root_with) + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + 
"max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. + """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. 
+ """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + +config = MuZeroConfig() +game = Game(config.seed) + +game_history = GameHistory() + +observation = game.reset() + +game_history.action_history.append(0) +game_history.observation_history.append(observation) # 添加reset之后的observation +game_history.reward_history.append(0) +game_history.to_play_history.append(game.to_play()) + +stacked_observations = game_history.get_stacked_observations( -1, config.stacked_observations, len(config.action_space)) + +done = False + +model = models.MuZeroNetwork(config) + +root, mcts_info = MCTS1(config).run(model, stacked_observations, game.legal_actions(), game.to_play(), True) + +print(root) + +game.close() \ No newline at end of file diff --git a/test/muzero_config_test.py b/test/muzero_config_test.py new file mode 100644 index 00000000..1b5fc135 --- /dev/null +++ b/test/muzero_config_test.py @@ -0,0 +1,6 @@ +from games.simple_grid import MuZeroConfig + +if __name__ == "__main__": + config = MuZeroConfig() + config.results_path /= "config_test" + print(config.results_path) \ No newline at end of file diff --git a/trainer.py b/trainer.py index 3e035c51..849beaa2 100644 --- a/trainer.py +++ b/trainer.py @@ -279,7 +279,7 @@ def update_lr(self): lr = self.config.lr_init * self.config.lr_decay_rate ** ( self.training_step / self.config.lr_decay_steps ) - for param_group in self.optimizer.param_groups: + for param_group in self.optimizer.param_groups: # 更新optimizer的lr param_group["lr"] = lr @staticmethod From 65ac04459be1d3fa04126c2668d7318467d18782 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Fri, 18 Aug 2023 05:55:10 +0100 Subject: [PATCH 5/9] simplified MuZero --- muzero_general.py | 413 +++++++++++++++++++++++++++++++ muzero_without_replay_buffer2.py | 331 +------------------------ simplifiedMuZero/models2.py | 366 +++++++++++++++++++++++---- simplified_muzero.py | 108 ++++++++ simplified_muzero2.py | 108 ++++++++ test/deap_test.py | 44 ++++ 6 files changed, 998 insertions(+), 372 deletions(-) create mode 100644 muzero_general.py create mode 100644 simplified_muzero.py create mode 100644 simplified_muzero2.py create mode 100644 test/deap_test.py diff --git a/muzero_general.py b/muzero_general.py new file mode 100644 index 00000000..6d8363d9 --- /dev/null +++ b/muzero_general.py @@ -0,0 +1,413 @@ +import importlib +import ray +import pathlib + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter + +import math +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer +from muzero import load_model_menu, hyperparameter_search + +import models + + +class CPUActorWithClass: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def 
get_initial_weights(self, config, model_cls): + model = model_cls(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + +class MuZeroGeneral: + def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + if save_path_ex: + self.config.results_path /= save_path_ex + else: + self.config.results_path /= model_cls.__name__ + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + self.model_cls = model_cls + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActorWithClass() + cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) + self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) + + + def logging_loop(self, writer, training_steps): + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + self.checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + 
"1.Total_reward/2.Mean_value", + self.checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + self.checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + self.checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + self.checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + self.checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + self.checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) + print( + f'Last test reward: {self.checkpoint["total_reward"]:.2f}. Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + + def update_gameplay_checkpoint(self, game_history): + self.checkpoint["episode_length"] = len(game_history.action_history) - 1 + self.checkpoint["total_reward"] = sum(game_history.reward_history) + self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(self.config.players): + self.checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ) + self.checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ) + + def save_checkpoint(self, path=None): #将模型存储在文件中 + if not path: + path = self.config.results_path / "model.checkpoint" + + torch.save(self.checkpoint, path) + + def train(self, log_in_tensorboard=True): + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + + trainer = Trainer(self.model_cls, self.checkpoint, self.config) + game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) + buffer = {} + play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = 
int(self.config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(self.config.results_path) + + for episode in range(max_steps): + game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + self.update_gameplay_checkpoint( game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % self.config.checkpoint_interval == 0: + self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if self.config.save_model: + self.save_checkpoint() + self.checkpoint["training_step"] = training_step + self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + self.checkpoint["total_loss"] = total_loss + self.checkpoint["value_loss"] = value_loss + self.checkpoint["reward_loss"] = reward_loss + self.checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + self.logging_loop(writer, training_step) + + + writer.close() + + game_play.close_game() + +# if __name__ == "__main__": +# # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") +# # start_time = time.time() +# # muzero.train() +# # end_time = time.time() +# # print("耗时: {:.2f}秒".format(end_time - start_time)) +# model_cls = models.MuZeroNetwork +# if len(sys.argv) == 2: +# # Train directly with: python muzero.py cartpole +# muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) +# muzero.train() +# elif len(sys.argv) == 3: +# # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' +# config = json.loads(sys.argv[2]) +# muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) +# muzero.train() +# else: +# print("\nWelcome to MuZero! Here's a list of games:") +# # Let user pick a game +# games = [ +# filename.stem +# for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) +# if filename.name != "abstract_game.py" +# ] +# for i in range(len(games)): +# print(f"{i}. {games[i]}") +# choice = input("Enter a number to choose the game: ") +# valid_inputs = [str(i) for i in range(len(games))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# +# # Initialize MuZero +# choice = int(choice) +# game_name = games[choice] +# muzero = MuZeroGeneral(game_name, model_cls=model_cls) +# +# while True: +# # Configure running options +# options = [ +# "Train", +# "Load pretrained model", +# "Diagnose model", +# "Render some self play games", +# "Play against MuZero", +# "Test the game manually", +# "Hyperparameter search", +# "Exit", +# ] +# print() +# for i in range(len(options)): +# print(f"{i}. 
{options[i]}") +# +# choice = input("Enter a number to choose an action: ") +# valid_inputs = [str(i) for i in range(len(options))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# choice = int(choice) +# if choice == 0: +# start_time = time.time() +# muzero.train() +# end_time = time.time() +# print("耗时: {:.2f}秒".format(end_time - start_time)) +# elif choice == 1: +# load_model_menu(muzero, game_name) +# elif choice == 2: +# muzero.diagnose_model(30) +# elif choice == 3: +# muzero.test(render=True, opponent="self", muzero_player=None) +# elif choice == 4: +# muzero.test(render=True, opponent="human", muzero_player=0) +# elif choice == 5: +# env = muzero.Game() +# env.reset() +# env.render() +# +# done = False +# while not done: +# action = env.human_to_action() +# observation, reward, done = env.step(action) +# print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") +# env.render() +# elif choice == 6: +# # Define here the parameters to tune +# # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html +# muzero.terminate_workers() +# del muzero +# budget = 20 +# parallel_experiments = 2 +# lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) +# discount = nevergrad.p.Log(lower=0.95, upper=0.9999) +# parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) +# best_hyperparameters = hyperparameter_search( +# game_name, parametrization, budget, parallel_experiments, 20 +# ) +# muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) +# else: +# break +# print("\nDone") diff --git a/muzero_without_replay_buffer2.py b/muzero_without_replay_buffer2.py index ebbb147f..4b87fc7b 100644 --- a/muzero_without_replay_buffer2.py +++ b/muzero_without_replay_buffer2.py @@ -1,321 +1,12 @@ -import pathlib -import importlib -import ray - -import numpy -import torch -from torch.utils.tensorboard import SummaryWriter -import pickle +import models +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search -import math +import json +import sys +import pathlib import time -import copy import nevergrad -import sys -import json - -from simplifiedMuZero.without_rb.game_play import GamePlay -from simplifiedMuZero.without_rb.play_buffer import PlayBuffer -from simplifiedMuZero.without_rb.trainer import Trainer -from muzero import load_model_menu, hyperparameter_search - -import models - - -class CPUActorWithClass: - # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU - def __init__(self): - pass - - def get_initial_weights(self, config, model_cls): - model = model_cls(config) - weigths = model.get_weights() - summary = str(model).replace("\n", " \n\n") - return weigths, summary - -class MuZeroWithoutRB: - def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): - # Load the game and the config from the module with the game name - try: - game_module = importlib.import_module("games." + game_name) - print("games." + game_name) - self.Game = game_module.Game - self.config = game_module.MuZeroConfig() - if save_path_ex: - config.results_path /= save_path_ex - except ModuleNotFoundError as err: - print( - f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' 
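The commented-out entry point in muzero_general.py above shows how MuZeroGeneral is intended to be driven interactively. A minimal non-interactive driver along the same lines, assuming the repository's games/cartpole.py config as in that commented example:

```python
# Minimal non-interactive driver for MuZeroGeneral, mirroring the commented-out
# entry point above; assumes games/cartpole.py provides Game and MuZeroConfig.
import time

import models
from muzero_general import MuZeroGeneral

if __name__ == "__main__":
    muzero = MuZeroGeneral("cartpole", model_cls=models.MuZeroNetwork,
                           save_path_ex="muzero_without_rb")
    start_time = time.time()
    muzero.train()
    print("Training took {:.2f}s".format(time.time() - start_time))
```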
- ) - raise err - - self.model_cls = model_cls - - # Overwrite the config - if config: - if type(config) is dict: - for param, value in config.items(): - if hasattr(self.config, param): - setattr(self.config, param, value) - else: - raise AttributeError( - f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." - ) - else: - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Manage GPUs - if self.config.max_num_gpus == 0 and ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - raise ValueError( - "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." - ) - if ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - total_gpus = ( - self.config.max_num_gpus - if self.config.max_num_gpus is not None - else torch.cuda.device_count() - ) - else: - total_gpus = 0 - self.num_gpus = total_gpus / split_resources_in - if 1 < self.num_gpus: - self.num_gpus = math.floor(self.num_gpus) - - # Checkpoint and replay buffer used to initialize workers - self.checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - self.replay_buffer = {} - - cpu_actor = CPUActorWithClass() - cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) - self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) - - - def logging_loop(self, writer, training_steps): - # writer = SummaryWriter(config.results_path) - - # print( - # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" - # ) - - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in self.config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # # Save model representation - # writer.add_text( - # "Model summary", - # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 - # ) - # Loop for updating the training performance - counter = training_steps - - try: - if True: - # while checkpoint["training_step"] < config.training_steps: - writer.add_scalar( - "1.Total_reward/1.Total_reward", - self.checkpoint["total_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/2.Mean_value", - self.checkpoint["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - self.checkpoint["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - self.checkpoint["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - self.checkpoint["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - self.checkpoint["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter - ) - writer.add_scalar( - 
"2.Workers/4.Reanalysed_games", - self.checkpoint["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), - counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) - print( - f'Last test reward: {self.checkpoint["total_reward"]:.2f}. Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - # time.sleep(0.5) - except KeyboardInterrupt: - pass - - # if config.save_model: - # # Persist replay buffer to disk - # path = config.results_path / "replay_buffer.pkl" - # print(f"\n\nPersisting replay buffer games to disk at {path}") - # pickle.dump( - # { - # "buffer": buffer, - # "num_played_games": checkpoint["num_played_games"], - # "num_played_steps": checkpoint["num_played_steps"], - # "num_reanalysed_games": checkpoint["num_reanalysed_games"], - # }, - # open(path, "wb"), - # ) - - def update_gameplay_checkpoint(self, game_history): - self.checkpoint["episode_length"] = len(game_history.action_history) - 1 - self.checkpoint["total_reward"] = sum(game_history.reward_history) - self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) - - if 1 < len(self.config.players): - self.checkpoint["muzero_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - == self.config.muzero_player - ) - self.checkpoint["opponent_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - != self.config.muzero_player - ) - - def save_checkpoint(self, path=None): #将模型存储在文件中 - if not path: - path = self.config.results_path / "model.checkpoint" - - torch.save(self.checkpoint, path) - - def train(self, log_in_tensorboard=True): - if log_in_tensorboard or self.config.save_model: - self.config.results_path.mkdir(parents=True, exist_ok=True) - - - trainer = Trainer(models.MuZeroNetwork, self.checkpoint, self.config) - game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) - buffer = {} - play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) - - step = 1 # 间隔,即每次模拟后训练多少次 - max_steps = int(self.config.training_steps/step) - # max_steps = 2000 - - writer = SummaryWriter(self.config.results_path) - - for episode in range(max_steps): - game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) - - # print(game_id) - # print(game_history.action_history) - # print(game_history.reward_history) - # print(game_history.to_play_history) - # # print(game_history.observation_history) - # print("child visits", game_history.child_visits) - # print(game_history.root_values) # root value指的是root节点的UCB值 - - play_buffer.update_game_history(game_id, game_history) - self.update_gameplay_checkpoint( game_history) - - for i in range(step): - index_batch, batch = 
play_buffer.get_batch() - # print(batch[1]) - trainer.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = trainer.update_weights(batch) - - - training_step = episode * step + i - if training_step % self.config.checkpoint_interval == 0: - self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) - self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) - - if self.config.save_model: - self.save_checkpoint() - self.checkpoint["training_step"] = training_step - self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] - self.checkpoint["total_loss"] = total_loss - self.checkpoint["value_loss"] = value_loss - self.checkpoint["reward_loss"] = reward_loss - self.checkpoint["policy_loss"] = policy_loss - - # print(training_step) - # if training_step % 500 == 0: - # if training_step % config.checkpoint_interval == 0: - # # print(training_step) - # logging_loop(config, checkpoint, writer) - - self.logging_loop(writer, training_step) - - - writer.close() - - game_play.close_game() if __name__ == "__main__": # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") @@ -326,12 +17,12 @@ def train(self, log_in_tensorboard=True): model_cls = models.MuZeroNetwork if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole - muzero = MuZeroWithoutRB(sys.argv[1], model_cls=model_cls) + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) muzero.train() elif len(sys.argv) == 3: # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' config = json.loads(sys.argv[2]) - muzero = MuZeroWithoutRB(sys.argv[1], config, model_cls=model_cls) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) muzero.train() else: print("\nWelcome to MuZero! 
Here's a list of games:") @@ -351,7 +42,7 @@ def train(self, log_in_tensorboard=True): # Initialize MuZero choice = int(choice) game_name = games[choice] - muzero = MuZeroWithoutRB(game_name, model_cls=model_cls) + muzero = MuZeroGeneral(game_name, model_cls=model_cls) while True: # Configure running options @@ -411,7 +102,7 @@ def train(self, log_in_tensorboard=True): best_hyperparameters = hyperparameter_search( game_name, parametrization, budget, parallel_experiments, 20 ) - muzero = MuZeroWithoutRB(game_name, best_hyperparameters , model_cls=model_cls) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) else: break - print("\nDone") + print("\nDone") \ No newline at end of file diff --git a/simplifiedMuZero/models2.py b/simplifiedMuZero/models2.py index 4fb55bad..fd6aa6ee 100644 --- a/simplifiedMuZero/models2.py +++ b/simplifiedMuZero/models2.py @@ -3,12 +3,13 @@ import torch -from models import * +from models import support_to_scalar, scalar_to_support, mlp, AbstractNetwork, conv3x3, RepresentationNetwork, DynamicsNetwork, PredictionNetwork -class SimplifiedMuZeroNetwork: +class MuZeroNetwork_2net: def __new__(cls, config): + print("MuZeroNetwork_2net") if config.network == "fullyconnected": - return SimplifiedMuZeroFullyConnectedNetwork( + return MuZeroFullyConnectedNetwork_2net( config.observation_shape, config.stacked_observations, len(config.action_space), @@ -21,7 +22,8 @@ def __new__(cls, config): config.support_size, ) elif config.network == "resnet": - return MuZeroResidualNetwork( + print("resnet") + return MuZeroResidualNetwork_2net( config.observation_shape, config.stacked_observations, len(config.action_space), @@ -40,64 +42,70 @@ def __new__(cls, config): raise NotImplementedError( 'The network parameter should be "fullyconnected" or "resnet".' ) -class SimplifiedMuZeroFullyConnectedNetwork(AbstractNetwork): - def __init__(self, - observation_shape, - stacked_observations, - action_space_size, - encoding_size, - fc_reward_layers, - fc_value_layers, - fc_policy_layers, - fc_representation_layers, - fc_dynamics_layers, - support_size, - ): +class MuZeroFullyConnectedNetwork_2net(AbstractNetwork): + def __init__( + self, + observation_shape, + stacked_observations, + action_space_size, + encoding_size, + fc_reward_layers, + fc_value_layers, + fc_policy_layers, + fc_representation_layers, + fc_dynamics_layers, + support_size, + ): super().__init__() - # 动作空间大小 self.action_space_size = action_space_size - #为什么是2*support_size +1 self.full_support_size = 2 * support_size + 1 - representation_input_size = observation_shape[0] * observation_shape[1] * observation_shape[2] * (stacked_observations + 1)\ + # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 + + representation_input_size = observation_shape[0] * observation_shape[1] * observation_shape[2] * ( + stacked_observations + 1) \ + stacked_observations * observation_shape[1] * observation_shape[2] - # 改进方法: - # 1. input size = encoding _size - # 2. 
input 后边加上 action space - self.representation_network = torch.nn.DataParallel( - mlp( - representation_input_size, - fc_representation_layers, - encoding_size - ) - ) + # 输出等于输入,即编码维度等于输入维度 + encoding_size = representation_input_size + + # self.representation_network = torch.nn.DataParallel( + # # mlp( + # # representation_input_size, + # # fc_representation_layers, + # # encoding_size, + # # ) + # mlp( + # representation_input_size + self.action_space_size, + # fc_representation_layers, + # encoding_size, + # ) + # ) - self.dynamic_encoded_state_network = torch.nn.DataParallel( + #dynamics的输入是encoding_size+action_space_size + self.dynamics_encoded_state_network = torch.nn.DataParallel( mlp( - encoding_size +self.action_space_size, + encoding_size + self.action_space_size, fc_dynamics_layers, - encoding_size + encoding_size, ) ) - self.dynamics_reward_network = torch.nn.DataParallel( - mlp(encoding_size, fc_reward_layers, self.full_support_size) + mlp(encoding_size, fc_reward_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) - self.prediction_polic_network = torch.nn.DataParallel( - mlp(encoding_size, fc_policy_layers, self.action_space_size) + self.prediction_policy_network = torch.nn.DataParallel( + mlp(encoding_size, fc_policy_layers, self.action_space_size) #输出action的概率 ) - self.prediction_value_network = torch.nn.DataParallel( - mlp(encoding_size, fc_value_layers, self.full_support_size) + mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) - def prediction(self, encode_state): - policy_logits = self.prediction_polic_network(encode_state) - value = self.prediction_value_network(encode_state) + def prediction(self, encoded_state): + policy_logits = self.prediction_policy_network(encoded_state) + value = self.prediction_value_network(encoded_state) return policy_logits, value - # 将encoded_stated标准化 + # 将encoded_stated标准化 def encoded_stated_normalized(self, encoded_state): min_encoded_state = encoded_state.min(1, keepdim=True)[0] max_encoded_state = encoded_state.max(1, keepdim=True)[0] @@ -106,11 +114,17 @@ def encoded_stated_normalized(self, encoded_state): encoded_state_normalized = (encoded_state - min_encoded_state) / scale_encoded_state return encoded_state_normalized - def representation(self, observation): - encoded_state = self.representation_network( - observation.view(observation.shape[0], -1) - ) + observation = observation.view(observation.shape[0], -1) + action_zeros = (torch.zeros((observation.shape[0], self.action_space_size)).to(observation.device).float()) + x = torch.cat((observation, action_zeros), dim=1) + + # encoded_state = self.representation_network(x) + encoded_state = self.dynamics_encoded_state_network(x) + + # encoded_state = self.representation_network( + # observation.view(observation.shape[0], -1) + # ) return self.encoded_stated_normalized(encoded_state) @@ -120,10 +134,9 @@ def dynamics(self, encoded_state, action): action_one_hot.scatter(1, action.long(), 1.0) x = torch.cat((encoded_state, action_one_hot), dim=1) - next_encoded_state = self.dynamic_encoded_state_network(x) + next_encoded_state = self.dynamics_encoded_state_network(x) reward = self.dynamics_reward_network(next_encoded_state) - next_encoded_state_normalized = self.encoded_stated_normalized(next_encoded_state) return next_encoded_state_normalized, reward @@ -131,8 +144,7 @@ def dynamics(self, encoded_state, action): def initial_inference(self, observation): encoded_state = 
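# Sketch (illustrative shapes, not from this patch) of the representation-through-
# dynamics trick used by MuZeroFullyConnectedNetwork_2net above: because
# encoding_size is set equal to the flattened observation size, the initial hidden
# state is produced by pushing the raw observation plus an all-zero action vector
# through the shared dynamics MLP instead of a separate representation network.
import torch

obs_size, action_space_size = 12, 4
shared_dynamics = torch.nn.Linear(obs_size + action_space_size, obs_size)  # stand-in for the dynamics MLP

observation = torch.randn(2, obs_size)            # batch of flattened observations
zero_action = torch.zeros(2, action_space_size)   # "no action yet" placeholder
initial_state = shared_dynamics(torch.cat((observation, zero_action), dim=1))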
self.representation(observation) policy_logits, value = self.prediction(encoded_state) - - # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 + # reward equal to 0 for consistency 一致性奖励等于 0 reward = torch.log( ( torch.zeros(1, self.full_support_size) @@ -141,8 +153,258 @@ def initial_inference(self, observation): .to(observation.device) ) ) + # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 + + return ( + value, + reward, + policy_logits, + encoded_state, + ) + + def recurrent_inference(self, encoded_state, action): + next_encoded_state, reward = self.dynamics(encoded_state, action) + policy_logits, value = self.prediction(next_encoded_state) + return value, reward, policy_logits, next_encoded_state + +class MuZeroResidualNetwork_2net(AbstractNetwork): + def __init__( + self, + observation_shape, + stacked_observations, # stacken_observations表示先去观察的数量,用在那些需要历史信息的游戏里。如果不需要历史观察,该值为0 + action_space_size, + num_blocks, + num_channels, + reduced_channels_reward, + reduced_channels_value, + reduced_channels_policy, + fc_reward_layers, + fc_value_layers, + fc_policy_layers, + support_size, + downsample, + ): + super().__init__() + print("observation shape is ", observation_shape) + print("num channels is ", num_channels) + + num_channels = observation_shape[1] + self.action_space_size = action_space_size + self.full_support_size = 2 * support_size + 1 + block_output_size_reward = ( + ( + reduced_channels_reward + * math.ceil(observation_shape[1] / 16) + * math.ceil(observation_shape[2] / 16) + ) + if downsample + else (reduced_channels_reward * observation_shape[1] * observation_shape[2]) + ) + + # observations_shape存放的时观察值的维度形状,第0维时观察的当前和历史维度,后面几维是观察值 + block_output_size_value = ( + ( + reduced_channels_value + * math.ceil(observation_shape[1] / 16) + * math.ceil(observation_shape[2] / 16) + ) + if downsample + else (reduced_channels_value * observation_shape[1] * observation_shape[2]) + ) + + block_output_size_policy = ( + ( + reduced_channels_policy + * math.ceil(observation_shape[1] / 16) + * math.ceil(observation_shape[2] / 16) + ) + if downsample + else (reduced_channels_policy * observation_shape[1] * observation_shape[2]) + ) + + # self.representation_network = torch.nn.DataParallel( + # RepresentationNetwork( + # observation_shape, + # stacked_observations, + # num_blocks, + # num_channels, + # downsample, + # ) + # ) + + self.dynamics_network = torch.nn.DataParallel( + DynamicsNetwork( + num_blocks, + num_channels + 1, + reduced_channels_reward, + fc_reward_layers, + self.full_support_size, + block_output_size_reward, + ) + ) + + self.prediction_network = torch.nn.DataParallel( + PredictionNetwork( + action_space_size, + num_blocks, + num_channels, + reduced_channels_value, + reduced_channels_policy, + fc_value_layers, + fc_policy_layers, + self.full_support_size, + block_output_size_value, + block_output_size_policy, + ) + ) + + def prediction(self, encoded_state): + # print("encoded_state shape is : " , encoded_state.shape) + policy, value = self.prediction_network(encoded_state) + return policy, value + + # def representation(self, observation): + # # print("observation shape is : ", observation.shape) + # encoded_state = self.representation_network(observation) + # + # # Scale encoded state between [0, 1] (See appendix paper Training) + # min_encoded_state = ( + # encoded_state.view( + # -1, + # encoded_state.shape[1], + # encoded_state.shape[2] * encoded_state.shape[3], + # ) + # .min(2, keepdim=True)[0] + # .unsqueeze(-1) + 
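# Sketch (standalone, not from this patch) of the categorical "support" convention
# referenced in the comments above: value/reward heads emit 2 * support_size + 1
# logits covering the integer range [-support_size, support_size], and the initial
# reward is the log of a one-hot distribution centred on zero
# (index support_size == full_support_size // 2).
import torch

support_size = 10
full_support_size = 2 * support_size + 1                        # 21 bins for [-10, 10]
initial_reward_logits = torch.log(
    torch.zeros(1, full_support_size)
    .scatter(1, torch.tensor([[full_support_size // 2]]), 1.0)  # put all mass on 0
)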
# ) + # max_encoded_state = ( + # encoded_state.view( + # -1, + # encoded_state.shape[1], + # encoded_state.shape[2] * encoded_state.shape[3], + # ) + # .max(2, keepdim=True)[0] + # .unsqueeze(-1) + # ) + # scale_encoded_state = max_encoded_state - min_encoded_state + # scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 + # encoded_state_normalized = ( + # encoded_state - min_encoded_state + # ) / scale_encoded_state + # return encoded_state_normalized + + def representation(self, encoded_state): + # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) + action_one_hot = ( + torch.ones( + ( + encoded_state.shape[0], + 1, + encoded_state.shape[2], + encoded_state.shape[3], + ) + ) + .to(encoded_state.device) + .float() + ) + # action_one_hot = ( + # action[:, :, None, None] * action_one_hot / self.action_space_size + # ) + x = torch.cat((encoded_state, action_one_hot), dim=1) + next_encoded_state, _ = self.dynamics_network(x) # 第二个参数是reward,在表示网络不需要它 + + # Scale encoded state between [0, 1] (See paper appendix Training) + min_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .min(2, keepdim=True)[0] + .unsqueeze(-1) + ) + max_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .max(2, keepdim=True)[0] + .unsqueeze(-1) + ) + scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state + scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 + next_encoded_state_normalized = ( + next_encoded_state - min_next_encoded_state + ) / scale_next_encoded_state + return next_encoded_state_normalized - return (value, reward, policy_logits, encoded_state) + def dynamics(self, encoded_state, action): + # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) + action_one_hot = ( + torch.ones( + ( + encoded_state.shape[0], + 1, + encoded_state.shape[2], + encoded_state.shape[3], + ) + ) + .to(action.device) + .float() + ) + action_one_hot = ( + action[:, :, None, None] * action_one_hot / self.action_space_size + ) + x = torch.cat((encoded_state, action_one_hot), dim=1) + next_encoded_state, reward = self.dynamics_network(x) + + # Scale encoded state between [0, 1] (See paper appendix Training) + min_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .min(2, keepdim=True)[0] + .unsqueeze(-1) + ) + max_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .max(2, keepdim=True)[0] + .unsqueeze(-1) + ) + scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state + scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 + next_encoded_state_normalized = ( + next_encoded_state - min_next_encoded_state + ) / scale_next_encoded_state + return next_encoded_state_normalized, reward + + def initial_inference(self, observation): + encoded_state = self.representation(observation) + # action = torch.tensor([[0]]).to(observation.device) + # encoded_state = self.dynamics(observation, action) + policy_logits, value = self.prediction(encoded_state) + # reward equal to 0 for consistency + reward = torch.log( + ( + torch.zeros(1, self.full_support_size) + .scatter(1, 
torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # 将support_size位置设为1 + .repeat(len(observation), 1) # 根据observation的长度复制,保证reward的维度于observation的一致,即之前的observation也赋值 + .to(observation.device) + ) + ) + return ( + value, + reward, + policy_logits, + encoded_state, + ) def recurrent_inference(self, encoded_state, action): next_encoded_state, reward = self.dynamics(encoded_state, action) diff --git a/simplified_muzero.py b/simplified_muzero.py new file mode 100644 index 00000000..cd99153e --- /dev/null +++ b/simplified_muzero.py @@ -0,0 +1,108 @@ +from simplifiedMuZero.net2.models_2net import SimplifiedMuZeroNetwork +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = SimplifiedMuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. 
{options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/simplified_muzero2.py b/simplified_muzero2.py new file mode 100644 index 00000000..a136dd44 --- /dev/null +++ b/simplified_muzero2.py @@ -0,0 +1,108 @@ +from simplifiedMuZero.models2 import MuZeroNetwork_2net +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = MuZeroNetwork_2net + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. 
{games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/test/deap_test.py b/test/deap_test.py new file mode 100644 index 00000000..0ec02e8e --- /dev/null +++ b/test/deap_test.py @@ -0,0 +1,44 @@ +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np + +config = MuZeroConfig() +print(config.max_moves) + +from deap import base, creator, tools +import numpy as np +# 定义问题 +creator.create('FitnessMax', base.Fitness, weights=(-1.0,)) #优化目标:单变量,求最小值 +creator.create('Individual', list, fitness = creator.FitnessMax) #创建Individual类,继承list + +legal_actions = 9 + +toolbox = base.Toolbox() +toolbox.register("Indices", random.sample, range(legal_actions), legal_actions) +toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Indices) + +ind1 = toolbox.Individual() +print(ind1) + +toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +pop = toolbox.population(n=36) +print(len(pop)) + +def ea(game): + pass + +# game = Game(0) +# game.reset() +# +# for i in range(9): +# game.render() +# print(game.legal_actions()) +# observation, reward, done = game.step(np.random.choice(game.legal_actions())) +# +# if done: +# break +# +# game.render() From 885308edef158a45c55ae2695de9f06a80a63c2b Mon Sep 17 00:00:00 2001 From: chunchangshao Date: 
Tue, 22 Aug 2023 20:14:38 +0100 Subject: [PATCH 6/9] parameter optimization --- game_tournament.py | 252 ++++- games/simple_grid.py | 2 + games/tictactoe.py | 3 +- muzero_2net.py | 8 +- muzero_no_pv.py | 716 +++++++++++++ muzero_rhea.py | 719 +++++++++++++ muzero_uniform.py | 4 +- muzero_without_replay_buffer.py | 964 ++---------------- muzero_without_replay_buffer2.py | 108 -- simplifiedMuZero/net2/__init__.py | 0 simplifiedMuZero/{ => net2}/models2.py | 9 +- simplifiedMuZero/net2/replay_buffer_2net.py | 7 +- simplifiedMuZero/net2/self_play_2net.py | 6 +- simplifiedMuZero/net2/trainer_2net.py | 6 +- simplifiedMuZero/no_pv/trainer_no_pv.py | 301 ++++++ simplifiedMuZero/search_policy/RHEA.py | 83 +- simplifiedMuZero/search_policy/RHEA2.py | 192 ++++ .../search_policy/rhea_self_play.py | 227 +++++ simplified_muzero.py | 4 +- simplified_muzero2.py | 108 -- test/deap_test.py | 108 +- test/deap_test2.py | 119 +++ test/load_model.py | 12 + 23 files changed, 2792 insertions(+), 1166 deletions(-) create mode 100644 muzero_no_pv.py create mode 100644 muzero_rhea.py delete mode 100644 muzero_without_replay_buffer2.py create mode 100644 simplifiedMuZero/net2/__init__.py rename simplifiedMuZero/{ => net2}/models2.py (98%) create mode 100644 simplifiedMuZero/no_pv/trainer_no_pv.py create mode 100644 simplifiedMuZero/search_policy/RHEA2.py create mode 100644 simplifiedMuZero/search_policy/rhea_self_play.py delete mode 100644 simplified_muzero2.py create mode 100644 test/deap_test2.py create mode 100644 test/load_model.py diff --git a/game_tournament.py b/game_tournament.py index 918beac3..9e8499e5 100644 --- a/game_tournament.py +++ b/game_tournament.py @@ -6,8 +6,8 @@ from games.tictactoe import MuZeroConfig, Game import models +import simplifiedMuZero.net2.models2 as models2 from self_play import MCTS, GameHistory,SelfPlay -from simplifiedMuZero.search_policy.self_play_uniform_search import UniformSearch class GameTournament: def __init__(self, config:MuZeroConfig): @@ -107,6 +107,73 @@ def play_competition(self, model1, search_policy1, model2, search_policy2): # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 return self.game.env.have_winner(), is_model1 == (reward > 0) + def play_with_expert(self, model, search_policy, expert_first=True): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model.eval() + + is_model = not expert_first + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
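# Sketch (toy inputs, not from this patch) of the temperature-based action choice
# behind the SelfPlay.select_action(root, 0) call below: temperature 0 greedily
# picks the child with the highest MCTS visit count, while a positive temperature
# samples in proportion to visit_count ** (1 / temperature).
import numpy

def select_from_visit_counts(actions, visit_counts, temperature):
    visit_counts = numpy.array(visit_counts, dtype="float64")
    if temperature == 0:
        return actions[int(numpy.argmax(visit_counts))]
    distribution = visit_counts ** (1 / temperature)
    distribution /= distribution.sum()
    return actions[numpy.random.choice(len(actions), p=distribution)]

print(select_from_visit_counts([0, 1, 2], [5, 30, 10], temperature=0))  # -> 1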
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + + if is_model: + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + else: + action = self.game.expert_agent() + root = None + + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model = not is_model + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model == (reward > 0) + def close_game(self): self.game.close() @@ -124,7 +191,7 @@ def play_tournament(self, models, rollnum=1000): no_winner_num = 0 for _ in range(rollnum): - have_winner, is_model1 = game_tournament.play_competition(model1, MCTS, model2, MCTS) + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) if have_winner: if is_model1: @@ -134,30 +201,133 @@ def play_tournament(self, models, rollnum=1000): else: no_winner_num += 1 - # 交换顺序,再来一遍 + # # 交换顺序,再来一遍 + # for _ in range(rollnum): + # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) + # + # if have_winner: + # if is_model1: + # model2_win_num += 1 + # else: + # model1_win_num += 1 + # else: + # no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): - have_winner, is_model1 = game_tournament.play_competition(model2, MCTS, model1, MCTS) + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) if have_winner: if is_model1: - model2_win_num += 1 - else: model1_win_num += 1 + else: + model2_win_num += 1 else: no_winner_num += 1 - # print(is_model1) - print(models[i]["name"]," ,", models[j]["name"]," : ") + print(models[j]["name"]," ,", models[i]["name"]," : ") - print(models[i]["name"], " win : ", model1_win_num) - print(models[j]["name"], " win : ", model2_win_num) + print(models[j]["name"], " win : ", model1_win_num) + print(models[i]["name"], " win : ", model2_win_num) print("No Winner", no_winner_num) print("===================================") + def play_tournament_with_expert(self, models, rollnum=1000): + model_num = 
len(models) + + for i in range(model_num): + model = models[i]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + + print(models[i]["name"], " ,", "expert : ") + + print(models[i]["name"], " win : ", model_win_num) + print("expert win : ", expert_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + print("expert : ", " ,", models[i]["name"]) + + print("expert win : ", expert_win_num) + print(models[i]["name"], " win : ", model_win_num) + print("No Winner", no_winner_num) + print("===================================") -def load_model(model_cls, model_path): + +def load_model(model_cls, model_path, config): checkpoint = torch.load(model_path) model = model_cls(config) model.set_weights(checkpoint["weights"]) @@ -168,17 +338,32 @@ def load_model(model_cls, model_path): if __name__ == "__main__": config = MuZeroConfig() - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1) + # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--09-40-26\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) + + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path) + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path) + uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) without_rb_checkpoint_path = 
r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path) + without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + + muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + + + simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + game_tournament = GameTournament(config) @@ -187,35 +372,14 @@ def load_model(model_cls, model_path): {"name":"uniform", "model":uniform_model}, {"name":"muzero", "model":muzero_model}, {"name": "without_rb", "model": without_rb_model}, + {"name": "no policy value", "model": muzero_no_policy_model}, + {"name": "simplified_muzero", "model": without_rb_model}, ] - # rollnum = 1000 - # - # # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - # model1_win_num = 0 - # model2_win_num = 0 - # no_winner_num = 0 - # - # for i in range(rollnum): - # have_winner, is_model1 = game_tournament.play_competition(muzero_2net_model, MCTS, uniform_model, MCTS) - # - # if have_winner: - # if is_model1: - # model1_win_num += 1 - # else: - # model2_win_num += 1 - # else: - # no_winner_num += 1 - # - # # print(is_model1) - # - # print(model1_win_num) - # print(model2_win_num) - # print(no_winner_num) - - game_tournament.play_tournament(models, rollnum=100) - game_tournament.close_game() + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament(models, rollnum=10) + game_tournament.play_tournament_with_expert(models, rollnum=100) + game_tournament.close_game() - # print(checkpoint) diff --git a/games/simple_grid.py b/games/simple_grid.py index f26ae429..d163d7de 100644 --- a/games/simple_grid.py +++ b/games/simple_grid.py @@ -23,6 +23,8 @@ def __init__(self): self.players = list(range(1)) # List of players. You should only edit the length self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + self.action_replace = True + # Evaluate self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class diff --git a/games/tictactoe.py b/games/tictactoe.py index c2529d5d..787986fb 100644 --- a/games/tictactoe.py +++ b/games/tictactoe.py @@ -27,7 +27,8 @@ def __init__(self): self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. 
None, "random" or "expert" if implemented in the Game class - + # 动作是否能重复 + self.action_replace = False ### Self-Play self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer diff --git a/muzero_2net.py b/muzero_2net.py index 39438acd..642602da 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -16,7 +16,9 @@ sys.path.append("") import diagnose_model -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net import simplifiedMuZero.net2.replay_buffer_2net as replay_buffer import simplifiedMuZero.net2.self_play_2net as self_play import shared_storage @@ -69,6 +71,7 @@ def __init__(self, game_name, config=None, split_resources_in=1): # 重命名路径,以便区分不同的模型 self.config.results_path /= "muzero_2net" + self.config.training_steps = 100000 # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) @@ -491,7 +494,8 @@ def __init__(self): pass def get_initial_weights(self, config): - model = models.SimplifiedMuZeroNetwork(config) + # model = models.SimplifiedMuZeroNetwork(config) + model = MuZeroNetwork_2net(config) weigths = model.get_weights() summary = str(model).replace("\n", " \n\n") return weigths, summary diff --git a/muzero_no_pv.py b/muzero_no_pv.py new file mode 100644 index 00000000..e94789ed --- /dev/null +++ b/muzero_no_pv.py @@ -0,0 +1,716 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import self_play +import shared_storage +import simplifiedMuZero.no_pv.trainer_no_pv as trainer + + +class MuZero: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
+ ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_rhea.py b/muzero_rhea.py new file mode 100644 index 00000000..07ceee18 --- /dev/null +++ b/muzero_rhea.py @@ -0,0 +1,719 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import 
simplifiedMuZero.search_policy.rhea_self_play as self_play +import shared_storage +import trainer + + +class MuZero_Rhea: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_Rhea("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= self.__class__.__name__ + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_Rhea): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_Rhea(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_Rhea(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_Rhea(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_Rhea(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_uniform.py b/muzero_uniform.py index 24a9e09b..53d4a0b9 100644 --- a/muzero_uniform.py +++ b/muzero_uniform.py @@ -16,7 +16,8 @@ import diagnose_model import models import replay_buffer -import simplifiedMuZero.search_policy.self_play_uniform_search as self_play +import self_play +# import simplifiedMuZero.search_policy.self_play_uniform_search as self_play import shared_storage import trainer @@ -67,6 +68,7 @@ def __init__(self, 
game_name, config=None, split_resources_in=1): # 重命名路径,以便区分不同的模型 self.config.results_path /= "muzero_uniform" + self.config.temperature_threshold = 0 # Fix random generator seed numpy.random.seed(self.config.seed) diff --git a/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py index 2eba36a0..4b87fc7b 100644 --- a/muzero_without_replay_buffer.py +++ b/muzero_without_replay_buffer.py @@ -1,870 +1,108 @@ -from self_play import MCTS, GameHistory -from games.simple_grid import MuZeroConfig, Game -# from games.tictactoe import MuZeroConfig, Game import models +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search -import numpy -import torch -from torch.utils.tensorboard import SummaryWriter -import pickle - -import math +import json +import sys +import pathlib import time -import copy - -class GamePlay: - """ - Class which run in a dedicated thread to play games and save them to the replay-buffer. - """ - - def __init__(self, model, initial_checkpoint, Game, config, seed): - self.config = config - self.game = Game(seed) - - # Fix random generator seed - numpy.random.seed(seed) - torch.manual_seed(seed) - - # Initialize the network - # self.model = models.MuZeroNetwork(self.config) - # self.model.set_weights(initial_checkpoint["weights"]) - self.model = model - self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) - self.model.eval() - self.trained_steps = initial_checkpoint["training_step"] - self.terminate = False - - #play game 运行 - # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 - # 运行步骤: - # 1. 创建GameHistory用来存储数据 - # 2. 检查游戏是否结束或者到底最大移动次数 - # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) - # 4. 运行MCTS搜索下一步的action - # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done - # 6. 持续运行2-5步直到结束 - # 7. 返回GameHistory - def play_game( - self, temperature, temperature_threshold, render, opponent, muzero_player - ): - """ - Play one game with actions based on the Monte Carlo tree search at each moves. - """ - game_history = GameHistory() - observation = self.game.reset() - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - game_id = None - - if render: - self.game.render() - - game_id = self.game.to_play() - - with torch.no_grad(): - while ( - not done and len(game_history.action_history) <= self.config.max_moves - ): # 游戏没有结束且运行步数小于最大移动步长 - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
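# --- Editorial aside (illustration only, not part of this patch) -------------------
# The selection step just below turns MCTS visit counts into a sampling distribution
# via counts ** (1 / temperature); the select_action helper further down in this
# removed class implements exactly that. A self-contained sketch, using a
# hypothetical helper (visit_softmax is not project code), of how the temperature
# sharpens or flattens the distribution:
import numpy

def visit_softmax(visit_counts, temperature):
    counts = numpy.array(visit_counts, dtype="float64")
    if temperature == 0:
        probs = numpy.zeros_like(counts)
        probs[counts.argmax()] = 1.0  # t = 0: purely greedy on the visit counts
        return probs
    scaled = counts ** (1 / temperature)
    return scaled / scaled.sum()      # t = 1: proportional; t < 1: sharper

print(visit_softmax([10, 30, 60], 1.0))   # ~[0.10, 0.30, 0.60]
print(visit_softmax([10, 30, 60], 0.25))  # ~[0.00, 0.06, 0.94]
print(visit_softmax([10, 30, 60], 0))     # [0., 0., 1.]
# ------------------------------------------------------------------------------------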
- stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 - # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 - - # 一下的if-else部分主要是为了选择一个动作 - # Choose the action - if opponent == "self" or muzero_player == self.game.to_play(): - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = self.select_action( - root, - temperature - if not temperature_threshold - or len(game_history.action_history) < temperature_threshold - else 0, - ) # 根据temperature选择动作 - - if render: - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print( - f"Root value for player {self.game.to_play()}: {root.value():.2f}" - ) - else: - action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 - opponent, stacked_observations - ) - - observation, reward, done = self.game.step(action) # 运行游戏 - - if render: - print(f"Played action: {self.game.action_to_string(action)}") - self.game.render() - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - return game_id, game_history - - def close_game(self): - self.game.close() - - def select_opponent_action(self, opponent, stacked_observations): - """ - Select opponent action for evaluating MuZero level. - """ - if opponent == "human": - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), - True, - ) - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") - print( - f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" - ) - return self.game.human_to_action(), root - elif opponent == "expert": - return self.game.expert_agent(), None - elif opponent == "random": - assert ( - self.game.legal_actions() - ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." - assert set(self.game.legal_actions()).issubset( - set(self.config.action_space) - ), "Legal actions should be a subset of the action space." - - return numpy.random.choice(self.game.legal_actions()), None - else: - raise NotImplementedError( - 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' - ) - - # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 - # 公式为 c^(1/t)。可以看到: - # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 - # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 - # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 - @staticmethod # 静态方法修饰符,类似于static关键字 - def select_action(node, temperature): - """ - Select action according to the visit count distribution and the temperature. - The temperature is changed dynamically with the visit_softmax_temperature function - in the config. 
- """ - visit_counts = numpy.array( - [child.visit_count for child in node.children.values()], dtype="int32" - ) - actions = [action for action in node.children.keys()] - if temperature == 0: - action = actions[numpy.argmax(visit_counts)] - elif temperature == float("inf"): - action = numpy.random.choice(actions) - else: - # See paper appendix Data Generation - visit_count_distribution = visit_counts ** (1 / temperature) - visit_count_distribution = visit_count_distribution / sum( - visit_count_distribution - ) - action = numpy.random.choice(actions, p=visit_count_distribution) - - return action - -class PlayBuffer: - """ - Class which run in a dedicated thread to store played games and generate batch. - """ - - def __init__(self, initial_checkpoint, initial_buffer, config): - self.config = config - self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} - self.num_played_games = initial_checkpoint["num_played_games"] - self.num_played_steps = initial_checkpoint["num_played_steps"] - self.total_samples = sum( - [len(game_history.root_values) for game_history in self.buffer.values()] - ) - if self.total_samples != 0: - print( - f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" - ) - - # Fix random generator seed - numpy.random.seed(self.config.seed) - - def save_game(self, game_history): - self.buffer[self.num_played_games] = game_history - self.num_played_games += 1 - self.num_played_steps += len(game_history.root_values) - self.total_samples += len(game_history.root_values) - - if self.config.replay_buffer_size < len(self.buffer): - del_id = self.num_played_games - len(self.buffer) - self.total_samples -= len(self.buffer[del_id].root_values) - del self.buffer[del_id] +import nevergrad - def get_buffer(self): - return self.buffer - - def get_batch(self): - ( - index_batch, - observation_batch, - action_batch, - reward_batch, - value_batch, - policy_batch, - gradient_scale_batch, - ) = ([], [], [], [], [], [], []) - weight_batch = None - - for game_id, game_history, game_prob in self.sample_n_games( - self.config.batch_size - ): - game_pos, pos_prob = self.sample_position(game_history) - - values, rewards, policies, actions = self.make_target( - game_history, game_pos - ) - - index_batch.append([game_id, game_pos]) - observation_batch.append( - game_history.get_stacked_observations( - game_pos, - self.config.stacked_observations, - len(self.config.action_space), - ) - ) - action_batch.append(actions) - value_batch.append(values) - reward_batch.append(rewards) - policy_batch.append(policies) - gradient_scale_batch.append( - [ - min( - self.config.num_unroll_steps, - len(game_history.action_history) - game_pos, - ) - ] - * len(actions) - ) - - # observation_batch: batch, channels, height, width - # action_batch: batch, num_unroll_steps+1 - # value_batch: batch, num_unroll_steps+1 - # reward_batch: batch, num_unroll_steps+1 - # policy_batch: batch, num_unroll_steps+1, len(action_space) - # weight_batch: batch - # gradient_scale_batch: batch, num_unroll_steps+1 - return ( - index_batch, - ( - observation_batch, - action_batch, - value_batch, - reward_batch, - policy_batch, - weight_batch, - gradient_scale_batch, - ), - ) - - def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 - """ - Sample game from buffer either uniformly or according to some priority. - See paper appendix Training. 
- """ - game_prob = None - - game_index = numpy.random.choice(len(self.buffer)) - game_id = self.num_played_games - len(self.buffer) + game_index - - return game_id, self.buffer[game_id], game_prob - - def sample_n_games(self, n_games): - selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) - game_prob_dict = {} - ret = [ - (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) - for game_id in selected_games +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = models.MuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" ] - return ret - - def sample_position(self, game_history): - """ - Sample position from game either uniformly or according to some priority. - See paper appendix Training. - """ - position_prob = None - - position_index = numpy.random.choice(len(game_history.root_values)) - - return position_index, position_prob - - def update_game_history(self, game_id, game_history): - # The element could have been removed since its selection and update - # if next(iter(self.buffer)) <= game_id: - # self.buffer[game_id] = game_history - - self.buffer[game_id] = game_history - - def compute_target_value(self, game_history, index): - # The value target is the discounted root value of the search tree td_steps into the - # future, plus the discounted sum of all rewards until then. - bootstrap_index = index + self.config.td_steps - if bootstrap_index < len(game_history.root_values): - root_values = ( - game_history.root_values - if game_history.reanalysed_predicted_root_values is None - else game_history.reanalysed_predicted_root_values - ) - last_step_value = ( - root_values[bootstrap_index] - if game_history.to_play_history[bootstrap_index] - == game_history.to_play_history[index] - else -root_values[bootstrap_index] - ) - - value = last_step_value * self.config.discount**self.config.td_steps - else: - value = 0 - - for i, reward in enumerate( - game_history.reward_history[index + 1 : bootstrap_index + 1] - ): - # The value is oriented from the perspective of the current player - value += ( - reward - if game_history.to_play_history[index] - == game_history.to_play_history[index + i] - else -reward - ) * self.config.discount**i - - return value - - def make_target(self, game_history, state_index): - """ - Generate targets for every unroll steps. 
- """ - target_values, target_rewards, target_policies, actions = [], [], [], [] - for current_index in range( - state_index, state_index + self.config.num_unroll_steps + 1 - ): - value = self.compute_target_value(game_history, current_index) - - if current_index < len(game_history.root_values): - target_values.append(value) - target_rewards.append(game_history.reward_history[current_index]) - target_policies.append(game_history.child_visits[current_index]) - actions.append(game_history.action_history[current_index]) - elif current_index == len(game_history.root_values): - target_values.append(0) - target_rewards.append(game_history.reward_history[current_index]) - # Uniform policy - target_policies.append( - [ - 1 / len(game_history.child_visits[0]) - for _ in range(len(game_history.child_visits[0])) - ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 ) - actions.append(game_history.action_history[current_index]) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) else: - # States past the end of games are treated as absorbing states - target_values.append(0) - target_rewards.append(0) - # Uniform policy - target_policies.append( - [ - 1 / len(game_history.child_visits[0]) - for _ in range(len(game_history.child_visits[0])) - ] - ) - actions.append(numpy.random.choice(self.config.action_space)) - - return target_values, target_rewards, target_policies, actions - -class Trainer: 
- """ - Class which run in a dedicated thread to train a neural network and save it - in the shared storage. - """ - - def __init__(self, initial_checkpoint, config): - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Initialize the network - self.model = models.MuZeroNetwork(self.config) - # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) - self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) - self.model.train() - - self.training_step = initial_checkpoint["training_step"] - - if "cuda" not in str(next(self.model.parameters()).device): - print("You are not training on GPU.\n") - - # Initialize the optimizer - if self.config.optimizer == "SGD": - self.optimizer = torch.optim.SGD( - self.model.parameters(), - lr=self.config.lr_init, - momentum=self.config.momentum, - weight_decay=self.config.weight_decay, - ) - elif self.config.optimizer == "Adam": - self.optimizer = torch.optim.Adam( - self.model.parameters(), - lr=self.config.lr_init, - weight_decay=self.config.weight_decay, - ) - else: - raise NotImplementedError( - f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." - ) - - # if initial_checkpoint["optimizer_state"] is not None: - # print("Loading optimizer...\n") - # self.optimizer.load_state_dict( - # copy.deepcopy(initial_checkpoint["optimizer_state"]) - # ) - - # # update weights 与 continuous update weights 的区别 - # # 1. update weights 是实际计算更新network的权重 - # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 - # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 - # next_batch = play_buffer.get_batch() - # # Training loop - # while self.training_step < self.config.training_steps and not terminate: - # index_batch, batch = next_batch - # next_batch = play_buffer.get_batch() - # self.update_lr() - # ( - # priorities, - # total_loss, - # value_loss, - # reward_loss, - # policy_loss, - # ) = self.update_weights(batch) - - def update_weights(self, batch): - """ - Perform one training step. 
- """ - - ( - observation_batch, - action_batch, - target_value, - target_reward, - target_policy, - weight_batch, - gradient_scale_batch, - ) = batch - - # Keep values as scalars for calculating the priorities for the prioritized replay - target_value_scalar = numpy.array(target_value, dtype="float32") - priorities = numpy.zeros_like(target_value_scalar) - - device = next(self.model.parameters()).device - observation_batch = ( - torch.tensor(numpy.array(observation_batch)).float().to(device) - ) - action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) - target_value = torch.tensor(target_value).float().to(device) - target_reward = torch.tensor(target_reward).float().to(device) - target_policy = torch.tensor(target_policy).float().to(device) - gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) - # observation_batch: batch, channels, height, width - # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) - # target_value: batch, num_unroll_steps+1 - # target_reward: batch, num_unroll_steps+1 - # target_policy: batch, num_unroll_steps+1, len(action_space) - # gradient_scale_batch: batch, num_unroll_steps+1 - - target_value = models.scalar_to_support(target_value, self.config.support_size) - target_reward = models.scalar_to_support( - target_reward, self.config.support_size - ) - # target_value: batch, num_unroll_steps+1, 2*support_size+1 - # target_reward: batch, num_unroll_steps+1, 2*support_size+1 - - ## Generate predictions - value, reward, policy_logits, hidden_state = self.model.initial_inference( - observation_batch - ) - predictions = [(value, reward, policy_logits)] - for i in range(1, action_batch.shape[1]): - value, reward, policy_logits, hidden_state = self.model.recurrent_inference( - hidden_state, action_batch[:, i] - ) - # Scale the gradient at the start of the dynamics function (See paper appendix Training) - hidden_state.register_hook(lambda grad: grad * 0.5) - predictions.append((value, reward, policy_logits)) - # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) - - ## Compute losses - value_loss, reward_loss, policy_loss = (0, 0, 0) - value, reward, policy_logits = predictions[0] - # Ignore reward loss for the first batch step - current_value_loss, _, current_policy_loss = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, 0], - target_reward[:, 0], - target_policy[:, 0], - ) - value_loss += current_value_loss - policy_loss += current_policy_loss - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, 0] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) - ** self.config.PER_alpha - ) - - for i in range(1, len(predictions)): - value, reward, policy_logits = predictions[i] - ( - current_value_loss, - current_reward_loss, - current_policy_loss, - ) = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, i], - target_reward[:, i], - target_policy[:, i], - ) - - # Scale gradient by the number of unroll steps (See paper appendix Training) - current_value_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - current_reward_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - current_policy_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) 
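# --- Editorial aside (illustration only, not part of this patch) -------------------
# The register_hook calls above implement the gradient scaling referenced in the
# "paper appendix Training" comments: the hook rescales the gradient flowing into a
# tensor during backward without changing the forward value. A minimal PyTorch-only
# check of that mechanism (toy tensors, nothing from this repository):
import torch

x = torch.ones(3, requires_grad=True)
x.register_hook(lambda grad: grad * 0.5)  # same trick as hidden_state / the loss terms
y = (2 * x).sum()
y.backward()
print(x.grad)  # tensor([1., 1., 1.]) -- halved from the unscaled [2., 2., 2.]
# ------------------------------------------------------------------------------------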
- - value_loss += current_value_loss - reward_loss += current_reward_loss - policy_loss += current_policy_loss - - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, i] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, i]) - ** self.config.PER_alpha - ) - - # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) - loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - - # Mean over batch dimension (pseudocode do a sum) - loss = loss.mean() - - # Optimize - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - self.training_step += 1 - - return ( - priorities, - # For log purpose - loss.item(), - value_loss.mean().item(), - reward_loss.mean().item(), - policy_loss.mean().item(), - ) - - def update_lr(self): - """ - Update learning rate - """ - lr = self.config.lr_init * self.config.lr_decay_rate ** ( - self.training_step / self.config.lr_decay_steps - ) - for param_group in self.optimizer.param_groups: - param_group["lr"] = lr - - @staticmethod - def loss_function( - value, - reward, - policy_logits, - target_value, - target_reward, - target_policy, - ): - # Cross-entropy seems to have a better convergence than MSE - value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) - reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) - policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) - - return value_loss, reward_loss, policy_loss - - -def logging_loop(config, checkpoint, writer, training_steps): - # writer = SummaryWriter(config.results_path) - - # print( - # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" - # ) - - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # # Save model representation - # writer.add_text( - # "Model summary", - # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 - # ) - # Loop for updating the training performance - counter = training_steps - - try: - if True: - # while checkpoint["training_step"] < config.training_steps: - writer.add_scalar( - "1.Total_reward/1.Total_reward", - checkpoint["total_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/2.Mean_value", - checkpoint["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - checkpoint["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - checkpoint["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - checkpoint["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - checkpoint["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", checkpoint["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter - ) - writer.add_scalar( - "2.Workers/4.Reanalysed_games", - checkpoint["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), - 
counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) - print( - f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. Loss: {checkpoint["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - # time.sleep(0.5) - except KeyboardInterrupt: - pass - - # if config.save_model: - # # Persist replay buffer to disk - # path = config.results_path / "replay_buffer.pkl" - # print(f"\n\nPersisting replay buffer games to disk at {path}") - # pickle.dump( - # { - # "buffer": buffer, - # "num_played_games": checkpoint["num_played_games"], - # "num_played_steps": checkpoint["num_played_steps"], - # "num_reanalysed_games": checkpoint["num_reanalysed_games"], - # }, - # open(path, "wb"), - # ) - -def update_gameplay_checkpoint(config, checkpoint, game_history): - checkpoint["episode_length"] = len(game_history.action_history) - 1 - checkpoint["total_reward"] = sum(game_history.reward_history) - checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) - - if 1 < len(config.players): - checkpoint["muzero_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - == config.muzero_player - ) - checkpoint["opponent_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - != config.muzero_player - ) - -def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 - if not path: - path = config.results_path / "model.checkpoint" - - torch.save(checkpoint, path) - -def train(log_in_tensorboard=True): - config = MuZeroConfig() - config.results_path /= "muzero_without_rb" - - if log_in_tensorboard or config.save_model: - config.results_path.mkdir(parents=True, exist_ok=True) - - checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - - trainer = Trainer(checkpoint, config) - selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) - buffer = {} - play_buffer = PlayBuffer(checkpoint, buffer, config) - - step = 1 # 间隔,即每次模拟后训练多少次 - max_steps = int(config.training_steps/step) - - writer = SummaryWriter(config.results_path) - - for episode in range(max_steps): - game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) - - # print(game_id) - # print(game_history.action_history) - # print(game_history.reward_history) - # print(game_history.to_play_history) - # # print(game_history.observation_history) - # print("child visits", game_history.child_visits) - # print(game_history.root_values) # root value指的是root节点的UCB值 - - play_buffer.update_game_history(game_id, game_history) - update_gameplay_checkpoint(config, checkpoint, game_history) - - for i in 
range(step): - index_batch, batch = play_buffer.get_batch() - # print(batch[1]) - trainer.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = trainer.update_weights(batch) - - - training_step = episode * step + i - if training_step % config.checkpoint_interval == 0: - checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) - checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) - - if config.save_model: - save_checkpoint(config, checkpoint) - checkpoint["training_step"] = training_step - checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] - checkpoint["total_loss"] = total_loss - checkpoint["value_loss"] = value_loss - checkpoint["reward_loss"] = reward_loss - checkpoint["policy_loss"] = policy_loss - - # print(training_step) - # if training_step % 500 == 0: - # if training_step % config.checkpoint_interval == 0: - # # print(training_step) - # logging_loop(config, checkpoint, writer) - - logging_loop(config, checkpoint, writer, training_step) - - - writer.close() - - selfplay.close_game() - -if __name__ == "__main__": - start_time = time.time() - train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) \ No newline at end of file + break + print("\nDone") \ No newline at end of file diff --git a/muzero_without_replay_buffer2.py b/muzero_without_replay_buffer2.py deleted file mode 100644 index 4b87fc7b..00000000 --- a/muzero_without_replay_buffer2.py +++ /dev/null @@ -1,108 +0,0 @@ -import models -from muzero_general import MuZeroGeneral -from muzero import load_model_menu, hyperparameter_search - -import json -import sys -import pathlib -import time -import nevergrad - -if __name__ == "__main__": - # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") - # start_time = time.time() - # muzero.train() - # end_time = time.time() - # print("耗时: {:.2f}秒".format(end_time - start_time)) - model_cls = models.MuZeroNetwork - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZeroGeneral(game_name, model_cls=model_cls) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 - ) - muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) - else: - break - print("\nDone") \ No newline at end of file diff --git a/simplifiedMuZero/net2/__init__.py b/simplifiedMuZero/net2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/models2.py b/simplifiedMuZero/net2/models2.py similarity index 98% rename from simplifiedMuZero/models2.py rename to simplifiedMuZero/net2/models2.py index fd6aa6ee..c36e8095 100644 --- a/simplifiedMuZero/models2.py +++ b/simplifiedMuZero/net2/models2.py @@ -7,7 +7,6 @@ class MuZeroNetwork_2net: def __new__(cls, config): - print("MuZeroNetwork_2net") if config.network == "fullyconnected": return MuZeroFullyConnectedNetwork_2net( config.observation_shape, @@ -57,6 +56,7 @@ def __init__( support_size, ): super().__init__() + print(self.__class__.__name__) self.action_space_size = action_space_size self.full_support_size = 2 * support_size + 1 # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 @@ -100,6 +100,7 @@ def __init__( mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) + def prediction(self, encoded_state): policy_logits = self.prediction_policy_network(encoded_state) value = self.prediction_value_network(encoded_state) @@ -128,10 +129,11 @@ def representation(self, observation): return self.encoded_stated_normalized(encoded_state) + # dynamic同representation的唯一不同就是前者需要将encoded_state和action合并在一起作为输入,而representation不需要绑定action def dynamics(self, encoded_state, action): action_one_hot = (torch.zeros((action.shape[0], self.action_space_size)).to(action.device).float()) - action_one_hot.scatter(1, action.long(), 1.0) + action_one_hot.scatter_(1, action.long(), 1.0) x = torch.cat((encoded_state, action_one_hot), dim=1) next_encoded_state = self.dynamics_encoded_state_network(x) @@ -185,9 +187,6 @@ def __init__( downsample, ): super().__init__() - print("observation shape is ", observation_shape) - print("num channels is ", num_channels) - num_channels 
= observation_shape[1] self.action_space_size = action_space_size self.full_support_size = 2 * support_size + 1 diff --git a/simplifiedMuZero/net2/replay_buffer_2net.py b/simplifiedMuZero/net2/replay_buffer_2net.py index 55522b86..646611c1 100644 --- a/simplifiedMuZero/net2/replay_buffer_2net.py +++ b/simplifiedMuZero/net2/replay_buffer_2net.py @@ -5,7 +5,9 @@ import ray import torch -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net @ray.remote @@ -318,7 +320,8 @@ def __init__(self, initial_checkpoint, config): torch.manual_seed(self.config.seed) # Initialize the network - self.model = models.SimplifiedMuZeroNetwork(self.config) + # self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model = MuZeroNetwork_2net(self.config) self.model.set_weights(initial_checkpoint["weights"]) self.model.to(torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu")) self.model.eval() diff --git a/simplifiedMuZero/net2/self_play_2net.py b/simplifiedMuZero/net2/self_play_2net.py index a0a208a8..5ca7bfbd 100644 --- a/simplifiedMuZero/net2/self_play_2net.py +++ b/simplifiedMuZero/net2/self_play_2net.py @@ -5,7 +5,9 @@ import ray import torch -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net @ray.remote @@ -23,7 +25,7 @@ def __init__(self, initial_checkpoint, Game, config, seed): torch.manual_seed(seed) # Initialize the network - self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model = MuZeroNetwork_2net(self.config) # self.model = models.MuZeroNetwork(self.config) self.model.set_weights(initial_checkpoint["weights"]) self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) diff --git a/simplifiedMuZero/net2/trainer_2net.py b/simplifiedMuZero/net2/trainer_2net.py index 567b8f9a..d11612bd 100644 --- a/simplifiedMuZero/net2/trainer_2net.py +++ b/simplifiedMuZero/net2/trainer_2net.py @@ -5,7 +5,9 @@ import ray import torch -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net @ray.remote @@ -23,7 +25,7 @@ def __init__(self, initial_checkpoint, config): torch.manual_seed(self.config.seed) # Initialize the network - self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model = MuZeroNetwork_2net(self.config) self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) self.model.train() diff --git a/simplifiedMuZero/no_pv/trainer_no_pv.py b/simplifiedMuZero/no_pv/trainer_no_pv.py new file mode 100644 index 00000000..e4a6080c --- /dev/null +++ b/simplifiedMuZero/no_pv/trainer_no_pv.py @@ -0,0 +1,301 @@ +import copy +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. 
+ """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + # loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: # 更新optimizer的lr + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/search_policy/RHEA.py b/simplifiedMuZero/search_policy/RHEA.py index d23c611b..fe070c8b 100644 --- a/simplifiedMuZero/search_policy/RHEA.py +++ b/simplifiedMuZero/search_policy/RHEA.py @@ -1,12 +1,75 @@ +import copy +import numpy as np +from functools import partial + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +class RHEA: + def __init__(self): + self.game = None + self.play_id = 0 + self.toolbox = base.Toolbox() + self.register("mate", tools.cxTwoPoint) + self.register("mutate", tools.mutFlipBit, indpb=0.05) + self.register("select", tools.selStochasticUniversalSampling) + + def game_evaluate(self, actions, game_stat=None, play_id=None): + game_stat = copy.deepcopy(game_stat) + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 
因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + def rhea(self, game_state:AbstractGame, config, play_id): + actions = game_state.legal_actions() + pop = self.population(actions. config.max_moves) + self.toolbox.register("evaluate", self.game_evaluate, game=game_state, play_id=play_id) + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + + results = tools.selBest(pop, k=1) + + # 返回第一个动作和评分 + return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + + -class RHEAIndividual: - def __init__(self, L:int, discount_factor:double, forword_model, state, play_id:int, - seed, heuristic): - self.state = state - self.L = L - self.discount_factor = discount_factor - self.forword_model = forword_model - self.play_id = play_id - self.seed = seed - self.heuristic = heuristic \ No newline at end of file diff --git a/simplifiedMuZero/search_policy/RHEA2.py b/simplifiedMuZero/search_policy/RHEA2.py new file mode 100644 index 00000000..73d30799 --- /dev/null +++ b/simplifiedMuZero/search_policy/RHEA2.py @@ -0,0 +1,192 @@ +import copy +import numpy as np +from functools import partial +import torch + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame +from self_play import Node +import models + +from games.tictactoe import MuZeroConfig, Game + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + + +def evaluate(actions, model, observation, config): + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + + for action in actions: + value, reward, policy_logits, hidden_state = model.recurrent_inference( + hidden_state, + torch.tensor([[action]]).to(observation.device), + ) + + reward = models.support_to_scalar(reward, config.support_size).item() + return reward, + +class RHEA: + def __init__(self, config, game): + self.game = game + self.config = config + self.play_id = -1 + self.toolbox = base.Toolbox() + self.toolbox.register("mate", tools.cxTwoPoint) + self.toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) + self.toolbox.register("select", tools.selStochasticUniversalSampling) + + # def game_evaluate(self, actions, game_stat=None, play_id=None): + # game_stat = copy.deepcopy(game_stat) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if play_id == player else -reward + # # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + # reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + # return reward, + # + # def action_evaluate(self, actions): + # game_stat = copy.deepcopy(self.game) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if self.play_id == player else -reward + # + # return reward, actions[:(i+1)] + 
# + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else min(len(actions), max_moves) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + # def rhea(self, game_state:AbstractGame): + # self.game = game_state + # self.play_id = game_state.to_play() + # actions = game_state.legal_actions() + # self.toolbox.register("evaluate", evaluate, ) + # pop = self.population(actions. self.config.max_moves) + # + # pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + # + # results = tools.selBest(pop, k=1) + # + # return self.action_evaluate(results[0]) + + + + # # 返回第一个动作和评分 + # return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + def run(self, + model, + observation, + legal_actions, + to_play, + action_replace, + override_root_with=None, + ): + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) + + # 检查可用的动作空间,如果小于等于1,则直接返回。因为进化算法无法杂交,会报错 + if len(legal_actions) <=1: + return legal_actions + else: + # self.toolbox.register("evaluate", evaluate, model=model, observation=observation, config=self.config) + self.toolbox.register("evaluate", self.evaluate) + pop = self.population(legal_actions, self.config.max_moves, self.config.num_simulations, replace=action_replace) + + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + results = tools.selBest(pop, k=1) + + return results[0] + +if __name__=="__main__": + game = Game() + config = MuZeroConfig() + game.reset() + done = False + + # rhea = RHEA(config, game) + # pop = rhea.population(game.legal_actions(), 9, config.num_simulations, config.action_replace) + # + # print(pop) + # rhea.toolbox.register("evaluate", rhea.evaluate) + # pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0, ngen=9, verbose=False) + # + # results = tools.selBest(pop, k=1) + # print(results) + + legal_actions = game.legal_actions() + while not done and len(legal_actions) >1: + legal_actions = game.legal_actions() + rhea = RHEA(config, game) + rhea.play_id = game.to_play() + + pop = rhea.population(legal_actions, config.max_moves, config.num_simulations, config.action_replace) + + rhea.toolbox.register("evaluate", rhea.evaluate) + + pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + print(pop) + results = tools.selBest(pop, k=1) + print(results) + action = results[0][0] + observation, reward, done = game.step(action) + # print(observation) + + + + + + + + + + + + diff --git a/simplifiedMuZero/search_policy/rhea_self_play.py b/simplifiedMuZero/search_policy/rhea_self_play.py new file mode 100644 index 00000000..ca49d875 --- /dev/null +++ b/simplifiedMuZero/search_policy/rhea_self_play.py @@ -0,0 
+1,227 @@ +import math +import time + +import numpy +import ray +import torch + +import models +from simplifiedMuZero.search_policy.RHEA2 import RHEA +from self_play import GameHistory + + +@ray.remote +class SelfPlayRhea: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. + """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. 
+ """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + # root, mcts_info = MCTS(self.config).run( + # self.model, + # stacked_observations, + # self.game.legal_actions(), + # self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + # True, + # ) + # action = self.select_action( + # root, + # temperature + # if not temperature_threshold + # or len(game_history.action_history) < temperature_threshold + # else 0, + # ) # 根据temperature选择动作 + actions = RHEA(self.config, self.game).run(self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + self.config.action_replace, + ) + action = actions[0] + + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + # game_history.store_search_statistics(root, self.config.action_space) + game_history.root_values.append(reward) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + return self.game.human_to_action(), None + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
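# Illustrative sketch (not part of the patch): the play_game loop above calls
# RHEA(self.config, self.game).run(...) to evolve a whole action sequence and
# then executes only actions[0].  The rolling-horizon idea behind that, with
# the DEAP machinery stripped away, looks roughly like the snippet below.
# Assumptions: `game` follows games.abstract_game.AbstractGame (reset / step /
# to_play / legal_actions / close) and still has at least one legal action;
# `rollout_value`, `rhea_first_action`, `horizon` and `population` are
# hypothetical names used only here.
import copy

import numpy as np


def rollout_value(game, actions, player_id):
    """Replay a candidate action sequence on a copy of the game and score it."""
    sim = copy.deepcopy(game)
    sim.reset()
    reward, mover, i = 0, player_id, 0
    for i, action in enumerate(actions):
        mover = sim.to_play()
        _, reward, done = sim.step(action)
        if done:
            break
    sim.close()
    reward = reward if mover == player_id else -reward
    return reward / (i + 1)  # shorter winning sequences score higher, as in RHEA2.evaluate


def rhea_first_action(game, horizon=9, population=32, seed=0):
    """Sample random legal sequences, keep the best, play only its first action."""
    rng = np.random.default_rng(seed)
    player_id = game.to_play()
    legal = game.legal_actions()
    candidates = [
        rng.choice(legal, size=min(horizon, len(legal)), replace=False)
        for _ in range(population)
    ]
    best = max(candidates, key=lambda seq: rollout_value(game, seq, player_id))
    return int(best[0])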
+ + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) diff --git a/simplified_muzero.py b/simplified_muzero.py index cd99153e..11cf7591 100644 --- a/simplified_muzero.py +++ b/simplified_muzero.py @@ -1,4 +1,4 @@ -from simplifiedMuZero.net2.models_2net import SimplifiedMuZeroNetwork +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net from muzero_general import MuZeroGeneral from muzero import load_model_menu, hyperparameter_search @@ -14,7 +14,7 @@ # muzero.train() # end_time = time.time() # print("耗时: {:.2f}秒".format(end_time - start_time)) - model_cls = SimplifiedMuZeroNetwork + model_cls = MuZeroNetwork_2net if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) diff --git a/simplified_muzero2.py b/simplified_muzero2.py deleted file mode 100644 index a136dd44..00000000 --- a/simplified_muzero2.py +++ /dev/null @@ -1,108 +0,0 @@ -from simplifiedMuZero.models2 import MuZeroNetwork_2net -from muzero_general import MuZeroGeneral -from muzero import load_model_menu, hyperparameter_search - -import json -import sys -import pathlib -import time -import nevergrad - -if __name__ == "__main__": - # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") - # start_time = time.time() - # muzero.train() - # end_time = time.time() - # print("耗时: {:.2f}秒".format(end_time - start_time)) - model_cls = MuZeroNetwork_2net - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZeroGeneral(game_name, model_cls=model_cls) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 - ) - muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) - else: - break - print("\nDone") \ No newline at end of file diff --git a/test/deap_test.py b/test/deap_test.py index 0ec02e8e..51b930c8 100644 --- a/test/deap_test.py +++ b/test/deap_test.py @@ -1,3 +1,4 @@ +import copy import random import deap @@ -5,40 +6,115 @@ import numpy as np config = MuZeroConfig() -print(config.max_moves) from deap import base, creator, tools import numpy as np # 定义问题 -creator.create('FitnessMax', base.Fitness, weights=(-1.0,)) #优化目标:单变量,求最小值 -creator.create('Individual', list, fitness = creator.FitnessMax) #创建Individual类,继承list +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) legal_actions = 9 toolbox = base.Toolbox() -toolbox.register("Indices", random.sample, range(legal_actions), legal_actions) -toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Indices) +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) +toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# tools.initIterate返回一个生成的动作序列 +toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) -ind1 = toolbox.Individual() -print(ind1) +# ind1 = toolbox.Individual() +# print(ind1) +# 重复生成动作序列 toolbox.register("population", tools.initRepeat, list, toolbox.Individual) -pop = toolbox.population(n=36) -print(len(pop)) +# pop = toolbox.population(n=36) +# print(len(pop)) -def ea(game): - pass -# game = Game(0) -# game.reset() -# -# for i in range(9): -# game.render() + +game = Game(0) +game2 = copy.deepcopy(game) +game.reset() +game2.reset() + +actions = game.legal_actions() +np.random.shuffle(actions) + +# for i in range(config.max_moves): +# # game.render() # print(game.legal_actions()) # observation, reward, done = game.step(np.random.choice(game.legal_actions())) # # if done: # 
break + +def evaluate(actions): + game = Game(1) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +pop = toolbox.population(n=100) + +# from deap import algorithms +# pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +# result = tools.selBest(pop, k=1) + +results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) # +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) # game.render() +# game2.render() diff --git a/test/deap_test2.py b/test/deap_test2.py new file mode 100644 index 00000000..ad6de6bc --- /dev/null +++ b/test/deap_test2.py @@ -0,0 +1,119 @@ +import copy +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np +from functools import partial + +config = MuZeroConfig() + +from deap import base, creator, tools +import numpy as np +# 定义问题 +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +legal_actions = 9 + +toolbox = base.Toolbox() +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) +# toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# # tools.initIterate返回一个生成的动作序列 +# toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) + +def individual(actions, max_moves, replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + +# print(individual([0,1,2,3,4], 9, replace=False)) +# print(individual([0,1,2,3,4], 9, replace=True)) +# exit() + +def population(actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(individual, actions, max_moves, replace), N) + +pop = population(range(9),9, N=4, replace=False) +print(pop) + +# exit() +# +# # 重复生成动作序列 +# toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +game = Game(0) + +actions = game.legal_actions() +np.random.shuffle(actions) + +def evaluate(actions): + game = Game(1) + game.reset() + + for i 
in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +# pop = toolbox.population(n=100) +# pop = [[0, 6, 8, 7, 4, 5, 2, 1, 3], [0, 6, 3, 7, 4, 5, 2, 1, 8]] + +from deap import algorithms +pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +results = tools.selBest(pop, k=1) + +# results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) +# +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) +# game.render() +# game2.render() diff --git a/test/load_model.py b/test/load_model.py new file mode 100644 index 00000000..88e83520 --- /dev/null +++ b/test/load_model.py @@ -0,0 +1,12 @@ +import torch + +import simplifiedMuZero.net2.models2 as models +from games.tictactoe import Game, MuZeroConfig + +from game_tournament import load_model + +config = MuZeroConfig() + +muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" +muzero_2net_model = load_model(models.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) + From 50c2c013a1a0731d50b96a94473686d760aeae21 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 23 Aug 2023 22:45:30 +0100 Subject: [PATCH 7/9] Replace fully connected network replace resnet in Tic-tac-toe --- game_tournament.py | 3 +- game_tournament2.py | 389 +++++++++++++++++++++++++++++++++++++++++++ game_tournament3.py | 390 ++++++++++++++++++++++++++++++++++++++++++++ games/tictactoe2.py | 361 ++++++++++++++++++++++++++++++++++++++++ games/tictactoe3.py | 354 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1496 insertions(+), 1 deletion(-) create mode 100644 game_tournament2.py create mode 100644 game_tournament3.py create mode 100644 games/tictactoe2.py create mode 100644 games/tictactoe3.py diff --git a/game_tournament.py b/game_tournament.py index 9e8499e5..8c87e7ef 100644 --- a/game_tournament.py +++ b/game_tournament.py @@ -345,7 +345,8 @@ def load_model(model_cls, model_path, config): # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - 
muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" diff --git a/game_tournament2.py b/game_tournament2.py new file mode 100644 index 00000000..cfdd56d6 --- /dev/null +++ b/game_tournament2.py @@ -0,0 +1,389 @@ +import pickle + +import torch +import copy +import numpy + +from games.tictactoe import MuZeroConfig, Game +import models +import simplifiedMuZero.net2.models2 as models2 +from self_play import MCTS, GameHistory,SelfPlay + +class GameTournament: + def __init__(self, config:MuZeroConfig): + self.models = [] + self.game = Game(config.seed) + self.config = config + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 0 + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def play_competition(self, model1, search_policy1, model2, search_policy2): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model1.eval() + model2.eval() + + is_model1 = True + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
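# Illustrative sketch (not part of the patch): play_competition returns the pair
# (have_winner, is_model1 == (reward > 0)); the comment table further down in
# this method spells out why the second element is True exactly when model 1
# won.  The hypothetical helper below only restates that convention so the
# tallying loops in play_tournament read as explicit win/draw/loss counts.
def competition_result(have_winner: bool, model1_won: bool) -> str:
    """Map play_competition's return value to 'model1', 'model2' or 'draw'."""
    if not have_winner:
        return "draw"
    return "model1" if model1_won else "model2"


# Hypothetical usage, mirroring the counters kept in play_tournament below:
# outcomes = [competition_result(*tournament.play_competition(m1, MCTS, m2, MCTS))
#             for _ in range(rollnum)]
# tally = {name: outcomes.count(name) for name in ("model1", "model2", "draw")}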
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + model = model1 if is_model1 else model2 + search_policy = search_policy1 if is_model1 else search_policy2 + + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model1 = not is_model1 + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model1 == (reward > 0) + + def play_with_expert(self, model, search_policy, expert_first=True): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model.eval() + + is_model = not expert_first + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
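# Illustrative sketch (not part of the patch): play_with_expert returns
# (have_winner, model_won) where model_won is is_model == (reward > 0), and
# play_tournament_with_expert further down plays one batch of games with the
# model moving first (expert_first=False) and another with the expert moving
# first.  The hypothetical helper below condenses that per-seating bookkeeping;
# `tournament` is an instance of this GameTournament class and `search_policy`
# would be the MCTS class imported at the top of this file.
def evaluate_vs_expert(tournament, model, search_policy, rollnum, expert_first):
    """Tally wins, losses and draws for one seating against the expert agent."""
    tally = {"model": 0, "expert": 0, "draw": 0}
    for _ in range(rollnum):
        have_winner, model_won = tournament.play_with_expert(
            model, search_policy, expert_first=expert_first
        )
        if not have_winner:
            tally["draw"] += 1
        elif model_won:
            tally["model"] += 1
        else:
            tally["expert"] += 1
    return tally


# Hypothetical usage, mirroring the two printed blocks in play_tournament_with_expert:
# as_first_player = evaluate_vs_expert(tournament, model, MCTS, 100, expert_first=False)
# as_second_player = evaluate_vs_expert(tournament, model, MCTS, 100, expert_first=True)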
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + + if is_model: + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + else: + action = self.game.expert_agent() + root = None + + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model = not is_model + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model == (reward > 0) + + def close_game(self): + self.game.close() + + def play_tournament(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + # # 交换顺序,再来一遍 + # for _ in range(rollnum): + # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) + # + # if have_winner: + # if is_model1: + # model2_win_num += 1 + # else: + # model1_win_num += 1 + # else: + # no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + + print(models[j]["name"]," ,", models[i]["name"]," : ") + + print(models[j]["name"], " win : ", model1_win_num) + print(models[i]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + def play_tournament_with_expert(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + model = models[i]["model"] + + # model1_win_num = 
sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + + print(models[i]["name"], " ,", "expert : ") + + print(models[i]["name"], " win : ", model_win_num) + print("expert win : ", expert_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + print("expert : ", " ,", models[i]["name"]) + + print("expert win : ", expert_win_num) + print(models[i]["name"], " win : ", model_win_num) + print("No Winner", no_winner_num) + print("===================================") + + + +def load_model(model_cls, model_path, config): + checkpoint = torch.load(model_path) + model = model_cls(config) + model.set_weights(checkpoint["weights"]) + + return model + + +if __name__ == "__main__": + config = MuZeroConfig() + + config.network = "fullyconnected" + # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-23--14-25-59\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) + + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) + + config2 = MuZeroConfig() + config2.network = "resnet" + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) + + # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) + # + # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + # + # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_no_policy_model = 
load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + # + # + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + # + # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + + + game_tournament = GameTournament(config) + + models = [ + {"name":"muzero_2net", "model":muzero_2net_model}, + # {"name":"uniform", "model":uniform_model}, + {"name":"muzero", "model":muzero_model}, + # {"name": "without_rb", "model": without_rb_model}, + # {"name": "no policy value", "model": muzero_no_policy_model}, + # {"name": "simplified_muzero", "model": without_rb_model}, + ] + + + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament(models, rollnum=10) + game_tournament.play_tournament_with_expert(models, rollnum=100) + + game_tournament.close_game() + diff --git a/game_tournament3.py b/game_tournament3.py new file mode 100644 index 00000000..14d1dec7 --- /dev/null +++ b/game_tournament3.py @@ -0,0 +1,390 @@ +import pickle + +import torch +import copy +import numpy + +from games.tictactoe2 import MuZeroConfig, Game +import models +import simplifiedMuZero.net2.models2 as models2 +from self_play import MCTS, GameHistory,SelfPlay + +class GameTournament: + def __init__(self, config:MuZeroConfig): + self.models = [] + self.game = Game(config.seed) + self.config = config + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 0 + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def play_competition(self, model1, search_policy1, model2, search_policy2): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model1.eval() + model2.eval() + + is_model1 = True + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
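These two asserts pin the raw observation to config.observation_shape, which is (3, 3, 3) for the TicTacToe games in this patch: three 3x3 planes holding player 1's stones, player 2's stones, and the side to move. Purely as a reading aid, a standalone sketch of that encoding, mirroring TicTacToe.get_observation from games/tictactoe2.py further down in this patch (the two sample moves are invented for illustration):

import numpy

# Mirror of TicTacToe.get_observation: encode the board as three 3x3 planes.
board = numpy.zeros((3, 3), dtype="int32")
board[1, 1] = 1    # player 1 took the centre (sample move, not from the patch)
board[0, 2] = -1   # player -1 replied in a corner (sample move)
player = 1         # side to move next

board_player1 = numpy.where(board == 1, 1, 0)    # plane of player 1's stones
board_player2 = numpy.where(board == -1, 1, 0)   # plane of player -1's stones
board_to_play = numpy.full((3, 3), player)       # constant plane marking whose turn it is
observation = numpy.array([board_player1, board_player2, board_to_play], dtype="int32")

assert observation.shape == (3, 3, 3)  # matches MuZeroConfig.observation_shape above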
+            stacked_observations = game_history.get_stacked_observations(
+                -1, self.config.stacked_observations, len(self.config.action_space)
+            )
+
+            model = model1 if is_model1 else model2
+            search_policy = search_policy1 if is_model1 else search_policy2
+
+            root, mcts_info = search_policy(self.config).run(
+                model,
+                stacked_observations,
+                self.game.legal_actions(),
+                self.game.to_play(),  # to_play returns the id of the player to move (0 by default)
+                True,
+            )
+
+            action = SelfPlay.select_action(root, 0)  # temperature 0 means no sampling: always pick the most visited action
+            observation, reward, done = self.game.step(action)
+
+            game_history.store_search_statistics(root, self.config.action_space)
+
+            # Next batch
+            game_history.action_history.append(action)
+            game_history.observation_history.append(observation)  # append to the observation queue; get_stacked_observations later reads it from the end backwards
+            game_history.reward_history.append(reward)
+            game_history.to_play_history.append(self.game.to_play())
+
+            # if the game is not over, switch to the other model
+            if not done:
+                is_model1 = not is_model1
+
+        # print("is model",is_model1, "reward is ", reward)
+
+        # restore the player id to its previous value, otherwise the winner check gives the wrong result
+        self.game.env.player *= -1
+
+        # Return value interpretation (is_model1, reward > 0, returned flag):
+        # | True  | True  | True  |  model 1 made the last move and won  -> winner is model 1
+        # | True  | False | False |  model 1 made the last move and lost -> winner is model 2
+        # | False | True  | False |  model 2 made the last move and won  -> winner is model 2
+        # | False | False | True  |  model 2 made the last move and lost -> winner is model 1
+        return self.game.env.have_winner(), is_model1 == (reward > 0)
+
+    def play_with_expert(self, model, search_policy, expert_first=True):
+        game_history = GameHistory()
+
+        observation = self.game.reset()
+
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # append the observation returned by reset
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+
+        model.eval()
+
+        is_model = not expert_first
+        while not done:
+            assert (
+                len(numpy.array(observation).shape) == 3
+            ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+            assert (
+                numpy.array(observation).shape == self.config.observation_shape
+            ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
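The comparison table in play_competition above reduces to one test: the side that made the final move is the winner exactly when that move's reward is positive, which is why the method returns is_model1 == (reward > 0). A standalone sketch of that attribution rule, as a reading aid only (attribute_winner is a hypothetical helper, not part of this patch; in this TicTacToe wrapper the step reward is only ever 0 or 20, so the negative-reward cases below exist only to mirror the table):

def attribute_winner(have_winner, last_mover_is_model1, reward):
    """Return None for a draw, otherwise 1 or 2 for the winning model."""
    if not have_winner:
        return None
    # The mover who ended the game won exactly when its final reward is positive,
    # so model 1 is the winner iff last_mover_is_model1 == (reward > 0).
    return 1 if last_mover_is_model1 == (reward > 0) else 2

# The four rows of the comment table:
assert attribute_winner(True, True, 20) == 1     # model 1 moved last and won
assert attribute_winner(True, True, -20) == 2    # model 1 moved last and lost (cannot occur here, reward is 0 or 20)
assert attribute_winner(True, False, 20) == 2    # model 2 moved last and won
assert attribute_winner(True, False, -20) == 1   # model 2 moved last and lost (likewise hypothetical)
assert attribute_winner(False, True, 0) is None  # draw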
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + + if is_model: + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + else: + action = self.game.expert_agent() + root = None + + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model = not is_model + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model == (reward > 0) + + def close_game(self): + self.game.close() + + def play_tournament(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + # # 交换顺序,再来一遍 + # for _ in range(rollnum): + # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) + # + # if have_winner: + # if is_model1: + # model2_win_num += 1 + # else: + # model1_win_num += 1 + # else: + # no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + + print(models[j]["name"]," ,", models[i]["name"]," : ") + + print(models[j]["name"], " win : ", model1_win_num) + print(models[i]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + def play_tournament_with_expert(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + model = models[i]["model"] + + # model1_win_num = 
sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + + print(models[i]["name"], " ,", "expert : ") + + print(models[i]["name"], " win : ", model_win_num) + print("expert win : ", expert_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + print("expert : ", " ,", models[i]["name"]) + + print("expert win : ", expert_win_num) + print(models[i]["name"], " win : ", model_win_num) + print("No Winner", no_winner_num) + print("===================================") + + + +def load_model(model_cls, model_path, config): + checkpoint = torch.load(model_path) + model = model_cls(config) + model.set_weights(checkpoint["weights"]) + + return model + + +if __name__ == "__main__": + config = MuZeroConfig() + + # config.network = "fullyconnected" + # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) + + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) + + config2 = MuZeroConfig() + config2.network = "resnet" + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) + + # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) + # + # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + # + # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_no_policy_model = 
load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + # + # + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + # + # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + + + game_tournament = GameTournament(config) + + models = [ + {"name":"muzero_2net", "model":muzero_2net_model}, + # {"name":"uniform", "model":uniform_model}, + {"name":"muzero", "model":muzero_model}, + {"name": "muzero2", "model": muzero_model}, + # {"name": "without_rb", "model": without_rb_model}, + # {"name": "no policy value", "model": muzero_no_policy_model}, + # {"name": "simplified_muzero", "model": without_rb_model}, + ] + + + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament(models, rollnum=10) + game_tournament.play_tournament_with_expert(models, rollnum=10) + + game_tournament.close_game() + diff --git a/games/tictactoe2.py b/games/tictactoe2.py new file mode 100644 index 00000000..ff9a90bf --- /dev/null +++ b/games/tictactoe2.py @@ -0,0 +1,361 @@ +import datetime +import pathlib + +import numpy +import torch + +from .abstract_game import AbstractGame + + +class MuZeroConfig: + def __init__(self): + # fmt: off + # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization + + self.seed = 0 # Seed for numpy, torch and the game + self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available + + + + ### Game + self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length + self.players = list(range(2)) # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class + + # 动作是否能重复 + self.action_replace = False + + ### Self-Play + self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer + self.selfplay_on_gpu = False + self.max_moves = 9 # Maximum number of moves if game is not finished before + self.num_simulations = 25 # Number of future moves self-simulated + self.discount = 1 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). 
If None, visit_softmax_temperature_fn is used every time + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.1 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + + + ### Network + # self.network = "resnet" # "resnet" / "fullyconnected" + self.network = "fullyconnected" + self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward))) + + # Residual Network + self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 16 # Number of channels in the ResNet + self.reduced_channels_reward = 16 # Number of channels in reward head + self.reduced_channels_value = 16 # Number of channels in value head + self.reduced_channels_policy = 16 # Number of channels in policy head + self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + # self.encoding_size = 32 + # self.fc_representation_layers = [] # Define the hidden layers in the representation network + # self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + # self.fc_reward_layers = [16] # Define the hidden layers in the reward network + # self.fc_value_layers = [] # Define the hidden layers in the value network + # self.fc_policy_layers = [] # Define the hidden layers in the policy network + + self.encoding_size = 32 + self.fc_representation_layers = [16] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + self.fc_reward_layers = [16] # Define the hidden layers in the reward network + self.fc_value_layers = [16] # Define the hidden layers in the value network + self.fc_policy_layers = [16] + + + ### Training + self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs + self.save_model = True # Save the checkpoint in results_path as model.checkpoint + # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + # self.training_steps = 50000 + self.training_steps = 500000 + self.batch_size = 64 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing + self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.003 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 10000 + + + + ### Replay Buffer + self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 20 # Number of game moves to keep for every batch element + self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value + self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + + # Reanalyze (See paper appendix Reanalyse) + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + self.reanalyse_on_gpu = False + + + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + # fmt: on + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + return 1 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + self.env = TicTacToe() + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. + + Returns: + The new observation, the reward and a boolean if the game has ended. + """ + observation, reward, done = self.env.step(action) + return observation, reward * 20, done + + def to_play(self): + """ + Return the current player. + + Returns: + The current player, it should be an element of the players list in the config. + """ + return self.env.to_play() + + def legal_actions(self): + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + return self.env.legal_actions() + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return self.env.reset() + + def render(self): + """ + Display the game observation. + """ + self.env.render() + input("Press enter to take a step ") + + def human_to_action(self): + """ + For multiplayer games, ask the user for a legal action + and return the corresponding action number. + + Returns: + An integer from the action space. 
+ """ + while True: + try: + row = int( + input( + f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + col = int( + input( + f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + choice = (row - 1) * 3 + (col - 1) + if ( + choice in self.legal_actions() + and 1 <= row + and 1 <= col + and row <= 3 + and col <= 3 + ): + break + except: + pass + print("Wrong input, try again") + return choice + + def expert_agent(self): + """ + Hard coded agent that MuZero faces to assess his progress in multiplayer games. + It doesn't influence training + + Returns: + Action as an integer to take in the current game state + """ + return self.env.expert_action() + + def action_to_string(self, action_number): + """ + Convert an action number to a string representing the action. + + Args: + action_number: an integer from the action space. + + Returns: + String representing the action. + """ + row = action_number // 3 + 1 + col = action_number % 3 + 1 + return f"Play row {row}, column {col}" + + +class TicTacToe: + def __init__(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + + def to_play(self): + return 0 if self.player == 1 else 1 + + def reset(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + return self.get_observation() + + def step(self, action): + row = action // 3 + col = action % 3 + self.board[row, col] = self.player + + done = self.have_winner() or len(self.legal_actions()) == 0 + + reward = 1 if self.have_winner() else 0 + + self.player *= -1 + + return self.get_observation(), reward, done + + def get_observation(self): + board_player1 = numpy.where(self.board == 1, 1, 0) + board_player2 = numpy.where(self.board == -1, 1, 0) + board_to_play = numpy.full((3, 3), self.player) + return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") + + def legal_actions(self): + legal = [] + for i in range(9): + row = i // 3 + col = i % 3 + if self.board[row, col] == 0: + legal.append(i) + return legal + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def expert_action(self): + board = self.board + action = numpy.random.choice(self.legal_actions()) + # Horizontal and vertical checks + for i in range(3): + if abs(sum(board[i, :])) == 2: + ind = numpy.where(board[i, :] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([i]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(board[i, :]) > 0: + return action + + if abs(sum(board[:, i])) == 2: + ind = numpy.where(board[:, i] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([i])), (3, 3) + )[0] + if self.player * sum(board[:, i]) > 0: + return action + + # Diagonal checks + diag = board.diagonal() + anti_diag = numpy.fliplr(board).diagonal() + if abs(sum(diag)) == 2: + ind = numpy.where(diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(diag) > 0: + return action + + if 
abs(sum(anti_diag)) == 2: + ind = numpy.where(anti_diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) + )[0] + if self.player * sum(anti_diag) > 0: + return action + + return action + + def render(self): + print(self.board[::-1]) diff --git a/games/tictactoe3.py b/games/tictactoe3.py new file mode 100644 index 00000000..1078bff0 --- /dev/null +++ b/games/tictactoe3.py @@ -0,0 +1,354 @@ +import datetime +import pathlib + +import numpy +import torch + +from .abstract_game import AbstractGame + + +class MuZeroConfig: + def __init__(self): + # fmt: off + # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization + + self.seed = 0 # Seed for numpy, torch and the game + self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available + + + + ### Game + self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length + self.players = list(range(2)) # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class + + # 动作是否能重复 + self.action_replace = False + + ### Self-Play + self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer + self.selfplay_on_gpu = False + self.max_moves = 9 # Maximum number of moves if game is not finished before + self.num_simulations = 25 # Number of future moves self-simulated + self.discount = 1 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). If None, visit_softmax_temperature_fn is used every time + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.1 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + + + ### Network + self.network = "resnet" # "resnet" / "fullyconnected" + self.network = "fullyconnected" + self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. 
Choose it so that support_size <= sqrt(max(abs(discounted reward))) + + # Residual Network + self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 16 # Number of channels in the ResNet + self.reduced_channels_reward = 16 # Number of channels in reward head + self.reduced_channels_value = 16 # Number of channels in value head + self.reduced_channels_policy = 16 # Number of channels in policy head + self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + self.encoding_size = 32 + self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + self.fc_reward_layers = [16] # Define the hidden layers in the reward network + self.fc_value_layers = [] # Define the hidden layers in the value network + self.fc_policy_layers = [] # Define the hidden layers in the policy network + + + + ### Training + self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs + self.save_model = True # Save the checkpoint in results_path as model.checkpoint + self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + # self.training_steps = 50000 + self.batch_size = 64 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing + self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.003 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 10000 + + + + ### Replay Buffer + self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 20 # Number of game moves to keep for every batch element + self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value + self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + + # Reanalyze (See paper appendix Reanalyse) + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + self.reanalyse_on_gpu = False + + + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + # fmt: on + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + return 1 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + self.env = TicTacToe() + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. + + Returns: + The new observation, the reward and a boolean if the game has ended. + """ + observation, reward, done = self.env.step(action) + return observation, reward * 20, done + + def to_play(self): + """ + Return the current player. + + Returns: + The current player, it should be an element of the players list in the config. + """ + return self.env.to_play() + + def legal_actions(self): + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + return self.env.legal_actions() + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return self.env.reset() + + def render(self): + """ + Display the game observation. + """ + self.env.render() + input("Press enter to take a step ") + + def human_to_action(self): + """ + For multiplayer games, ask the user for a legal action + and return the corresponding action number. + + Returns: + An integer from the action space. 
+ """ + while True: + try: + row = int( + input( + f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + col = int( + input( + f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + choice = (row - 1) * 3 + (col - 1) + if ( + choice in self.legal_actions() + and 1 <= row + and 1 <= col + and row <= 3 + and col <= 3 + ): + break + except: + pass + print("Wrong input, try again") + return choice + + def expert_agent(self): + """ + Hard coded agent that MuZero faces to assess his progress in multiplayer games. + It doesn't influence training + + Returns: + Action as an integer to take in the current game state + """ + return self.env.expert_action() + + def action_to_string(self, action_number): + """ + Convert an action number to a string representing the action. + + Args: + action_number: an integer from the action space. + + Returns: + String representing the action. + """ + row = action_number // 3 + 1 + col = action_number % 3 + 1 + return f"Play row {row}, column {col}" + + +class TicTacToe: + def __init__(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + + def to_play(self): + return 0 if self.player == 1 else 1 + + def reset(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + return self.get_observation() + + def step(self, action): + row = action // 3 + col = action % 3 + self.board[row, col] = self.player + + done = self.have_winner() or len(self.legal_actions()) == 0 + + reward = 1 if self.have_winner() else 0 + + self.player *= -1 + + return self.get_observation(), reward, done + + def get_observation(self): + board_player1 = numpy.where(self.board == 1, 1, 0) + board_player2 = numpy.where(self.board == -1, 1, 0) + board_to_play = numpy.full((3, 3), self.player) + return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") + + def legal_actions(self): + legal = [] + for i in range(9): + row = i // 3 + col = i % 3 + if self.board[row, col] == 0: + legal.append(i) + return legal + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def expert_action(self): + board = self.board + action = numpy.random.choice(self.legal_actions()) + # Horizontal and vertical checks + for i in range(3): + if abs(sum(board[i, :])) == 2: + ind = numpy.where(board[i, :] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([i]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(board[i, :]) > 0: + return action + + if abs(sum(board[:, i])) == 2: + ind = numpy.where(board[:, i] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([i])), (3, 3) + )[0] + if self.player * sum(board[:, i]) > 0: + return action + + # Diagonal checks + diag = board.diagonal() + anti_diag = numpy.fliplr(board).diagonal() + if abs(sum(diag)) == 2: + ind = numpy.where(diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(diag) > 0: + return action + + if 
abs(sum(anti_diag)) == 2: + ind = numpy.where(anti_diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) + )[0] + if self.player * sum(anti_diag) > 0: + return action + + return action + + def render(self): + print(self.board[::-1]) From 2747fdcc7fb7d6a7f672d60b3e32c37b1c80b7c4 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Fri, 25 Aug 2023 02:33:49 +0100 Subject: [PATCH 8/9] organize files --- game_tournament.py | 56 +-- game_tournament2.py | 389 ------------------ game_tournament3.py | 390 ------------------- games/tictactoe.py | 20 +- games/tictactoe2.py | 361 ----------------- games/tictactoe3.py | 354 ----------------- muzero_2net.py | 1 - muzero_general.py | 5 +- simplifiedMuZero/without_rb/trainer_no_PV.py | 243 ++++++++++++ 9 files changed, 292 insertions(+), 1527 deletions(-) delete mode 100644 game_tournament2.py delete mode 100644 game_tournament3.py delete mode 100644 games/tictactoe2.py delete mode 100644 games/tictactoe3.py create mode 100644 simplifiedMuZero/without_rb/trainer_no_PV.py diff --git a/game_tournament.py b/game_tournament.py index 8c87e7ef..81b1e363 100644 --- a/game_tournament.py +++ b/game_tournament.py @@ -338,49 +338,55 @@ def load_model(model_cls, model_path, config): if __name__ == "__main__": config = MuZeroConfig() + # config.network = "fullyconnected" # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--09-40-26\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--17-12-53\model.checkpoint" muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) + config2 = MuZeroConfig() + # config2.network = "resnet" # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) - - uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) - - without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) - - muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) - - - simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - simplified_muzero = 
load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) - - # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" - # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-24--02-55-21\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) + + # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) + # + # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + # + # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + # + # + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + # + # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) game_tournament = GameTournament(config) models = [ {"name":"muzero_2net", "model":muzero_2net_model}, - {"name":"uniform", "model":uniform_model}, + # {"name":"uniform", "model":uniform_model}, {"name":"muzero", "model":muzero_model}, - {"name": "without_rb", "model": without_rb_model}, - {"name": "no policy value", "model": muzero_no_policy_model}, - {"name": "simplified_muzero", "model": without_rb_model}, + # {"name": "muzero2", "model": muzero_model}, + # {"name": "without_rb", "model": without_rb_model}, + # {"name": "no policy value", "model": muzero_no_policy_model}, + # {"name": "simplified_muzero", "model": without_rb_model}, ] # game_tournament.play_tournament(models, rollnum=1000) - game_tournament.play_tournament(models, rollnum=10) - game_tournament.play_tournament_with_expert(models, rollnum=100) + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament_with_expert(models, rollnum=500) game_tournament.close_game() diff --git a/game_tournament2.py b/game_tournament2.py deleted file mode 100644 index cfdd56d6..00000000 --- a/game_tournament2.py +++ /dev/null @@ -1,389 +0,0 @@ -import pickle - -import torch -import copy -import numpy - -from games.tictactoe import MuZeroConfig, Game -import models -import simplifiedMuZero.net2.models2 as models2 -from self_play import MCTS, GameHistory,SelfPlay - -class GameTournament: - def __init__(self, config:MuZeroConfig): - self.models = [] - self.game = Game(config.seed) - self.config 
= config - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 0 - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def play_competition(self, model1, search_policy1, model2, search_policy2): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model1.eval() - model2.eval() - - is_model1 = True - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - model = model1 if is_model1 else model2 - search_policy = search_policy1 if is_model1 else search_policy2 - - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model1 = not is_model1 - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model1 == (reward > 0) - - def play_with_expert(self, model, search_policy, expert_first=True): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model.eval() - - is_model = not expert_first - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of 
{len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - - if is_model: - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - else: - action = self.game.expert_agent() - root = None - - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model = not is_model - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model == (reward > 0) - - def close_game(self): - self.game.close() - - def play_tournament(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - # # 交换顺序,再来一遍 - # for _ in range(rollnum): - # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) - # - # if have_winner: - # if is_model1: - # model2_win_num += 1 - # else: - # model1_win_num += 1 - # else: - # no_winner_num += 1 - - # print(is_model1) - - print(models[i]["name"]," ,", models[j]["name"]," : ") - - print(models[i]["name"], " win : ", model1_win_num) - print(models[j]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - - print(models[j]["name"]," ,", models[i]["name"]," : ") - - 
print(models[j]["name"], " win : ", model1_win_num) - print(models[i]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - def play_tournament_with_expert(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - model = models[i]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - - print(models[i]["name"], " ,", "expert : ") - - print(models[i]["name"], " win : ", model_win_num) - print("expert win : ", expert_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - for _ in range(rollnum): - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - print("expert : ", " ,", models[i]["name"]) - - print("expert win : ", expert_win_num) - print(models[i]["name"], " win : ", model_win_num) - print("No Winner", no_winner_num) - print("===================================") - - - -def load_model(model_cls, model_path, config): - checkpoint = torch.load(model_path) - model = model_cls(config) - model.set_weights(checkpoint["weights"]) - - return model - - -if __name__ == "__main__": - config = MuZeroConfig() - - config.network = "fullyconnected" - # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-23--14-25-59\model.checkpoint" - muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) - - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - - config2 = MuZeroConfig() - config2.network = "resnet" - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) - - # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) - # - # without_rb_checkpoint_path = 
r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) - # - # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) - # - # - # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) - # - # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" - # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) - - - game_tournament = GameTournament(config) - - models = [ - {"name":"muzero_2net", "model":muzero_2net_model}, - # {"name":"uniform", "model":uniform_model}, - {"name":"muzero", "model":muzero_model}, - # {"name": "without_rb", "model": without_rb_model}, - # {"name": "no policy value", "model": muzero_no_policy_model}, - # {"name": "simplified_muzero", "model": without_rb_model}, - ] - - - # game_tournament.play_tournament(models, rollnum=1000) - game_tournament.play_tournament(models, rollnum=10) - game_tournament.play_tournament_with_expert(models, rollnum=100) - - game_tournament.close_game() - diff --git a/game_tournament3.py b/game_tournament3.py deleted file mode 100644 index 14d1dec7..00000000 --- a/game_tournament3.py +++ /dev/null @@ -1,390 +0,0 @@ -import pickle - -import torch -import copy -import numpy - -from games.tictactoe2 import MuZeroConfig, Game -import models -import simplifiedMuZero.net2.models2 as models2 -from self_play import MCTS, GameHistory,SelfPlay - -class GameTournament: - def __init__(self, config:MuZeroConfig): - self.models = [] - self.game = Game(config.seed) - self.config = config - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 0 - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def play_competition(self, model1, search_policy1, model2, search_policy2): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model1.eval() - model2.eval() - - is_model1 = True - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. 
Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - model = model1 if is_model1 else model2 - search_policy = search_policy1 if is_model1 else search_policy2 - - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model1 = not is_model1 - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model1 == (reward > 0) - - def play_with_expert(self, model, search_policy, expert_first=True): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model.eval() - - is_model = not expert_first - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
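# Illustrative sketch of what the two asserts above expect. For the TicTacToe configs in
# this patch, config.observation_shape is (3, 3, 3): one plane for player 1's marks, one
# for player 2's marks, and one plane filled with the player to move, as built by
# TicTacToe.get_observation() further down in this patch. A minimal sketch, assuming an
# empty board with player 1 to move (variable names here are illustrative only):
import numpy
board = numpy.zeros((3, 3), dtype="int32")
player = 1
observation = numpy.array(
    [
        numpy.where(board == 1, 1, 0),    # player 1 plane
        numpy.where(board == -1, 1, 0),   # player 2 plane
        numpy.full((3, 3), player),       # to-play plane
    ],
    dtype="int32",
)
assert len(observation.shape) == 3 and observation.shape == (3, 3, 3)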
- stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - - if is_model: - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - else: - action = self.game.expert_agent() - root = None - - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model = not is_model - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model == (reward > 0) - - def close_game(self): - self.game.close() - - def play_tournament(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - # # 交换顺序,再来一遍 - # for _ in range(rollnum): - # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) - # - # if have_winner: - # if is_model1: - # model2_win_num += 1 - # else: - # model1_win_num += 1 - # else: - # no_winner_num += 1 - - # print(is_model1) - - print(models[i]["name"]," ,", models[j]["name"]," : ") - - print(models[i]["name"], " win : ", model1_win_num) - print(models[j]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - - print(models[j]["name"]," ,", models[i]["name"]," : ") - - print(models[j]["name"], " win : ", model1_win_num) - print(models[i]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - def play_tournament_with_expert(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - model = models[i]["model"] - - # model1_win_num = 
sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - - print(models[i]["name"], " ,", "expert : ") - - print(models[i]["name"], " win : ", model_win_num) - print("expert win : ", expert_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - for _ in range(rollnum): - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - print("expert : ", " ,", models[i]["name"]) - - print("expert win : ", expert_win_num) - print(models[i]["name"], " win : ", model_win_num) - print("No Winner", no_winner_num) - print("===================================") - - - -def load_model(model_cls, model_path, config): - checkpoint = torch.load(model_path) - model = model_cls(config) - model.set_weights(checkpoint["weights"]) - - return model - - -if __name__ == "__main__": - config = MuZeroConfig() - - # config.network = "fullyconnected" - # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint" - muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) - - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - - config2 = MuZeroConfig() - config2.network = "resnet" - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) - - # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) - # - # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) - # - # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_no_policy_model = 
load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) - # - # - # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) - # - # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" - # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) - - - game_tournament = GameTournament(config) - - models = [ - {"name":"muzero_2net", "model":muzero_2net_model}, - # {"name":"uniform", "model":uniform_model}, - {"name":"muzero", "model":muzero_model}, - {"name": "muzero2", "model": muzero_model}, - # {"name": "without_rb", "model": without_rb_model}, - # {"name": "no policy value", "model": muzero_no_policy_model}, - # {"name": "simplified_muzero", "model": without_rb_model}, - ] - - - # game_tournament.play_tournament(models, rollnum=1000) - game_tournament.play_tournament(models, rollnum=10) - game_tournament.play_tournament_with_expert(models, rollnum=10) - - game_tournament.close_game() - diff --git a/games/tictactoe.py b/games/tictactoe.py index 787986fb..ff9a90bf 100644 --- a/games/tictactoe.py +++ b/games/tictactoe.py @@ -49,7 +49,8 @@ def __init__(self): ### Network - self.network = "resnet" # "resnet" / "fullyconnected" + # self.network = "resnet" # "resnet" / "fullyconnected" + self.network = "fullyconnected" self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward))) # Residual Network @@ -64,20 +65,27 @@ def __init__(self): self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network # Fully Connected Network + # self.encoding_size = 32 + # self.fc_representation_layers = [] # Define the hidden layers in the representation network + # self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + # self.fc_reward_layers = [16] # Define the hidden layers in the reward network + # self.fc_value_layers = [] # Define the hidden layers in the value network + # self.fc_policy_layers = [] # Define the hidden layers in the policy network + self.encoding_size = 32 - self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_representation_layers = [16] # Define the hidden layers in the representation network self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network self.fc_reward_layers = [16] # Define the hidden layers in the reward network - self.fc_value_layers = [] # Define the hidden layers in the value network - self.fc_policy_layers = [] # Define the hidden layers in the policy network - + self.fc_value_layers = [16] # Define the hidden layers in the value network + self.fc_policy_layers = [16] ### Training self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs self.save_model = True # Save the checkpoint in results_path as model.checkpoint # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) - self.training_steps = 
50000 + # self.training_steps = 50000 + self.training_steps = 500000 self.batch_size = 64 # Number of parts of games to train on at each training step self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) diff --git a/games/tictactoe2.py b/games/tictactoe2.py deleted file mode 100644 index ff9a90bf..00000000 --- a/games/tictactoe2.py +++ /dev/null @@ -1,361 +0,0 @@ -import datetime -import pathlib - -import numpy -import torch - -from .abstract_game import AbstractGame - - -class MuZeroConfig: - def __init__(self): - # fmt: off - # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization - - self.seed = 0 # Seed for numpy, torch and the game - self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available - - - - ### Game - self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) - self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length - self.players = list(range(2)) # List of players. You should only edit the length - self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation - - # Evaluate - self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) - self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class - - # 动作是否能重复 - self.action_replace = False - - ### Self-Play - self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer - self.selfplay_on_gpu = False - self.max_moves = 9 # Maximum number of moves if game is not finished before - self.num_simulations = 25 # Number of future moves self-simulated - self.discount = 1 # Chronological discount of the reward - self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). If None, visit_softmax_temperature_fn is used every time - - # Root prior exploration noise - self.root_dirichlet_alpha = 0.1 - self.root_exploration_fraction = 0.25 - - # UCB formula - self.pb_c_base = 19652 - self.pb_c_init = 1.25 - - - - ### Network - # self.network = "resnet" # "resnet" / "fullyconnected" - self.network = "fullyconnected" - self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. 
Choose it so that support_size <= sqrt(max(abs(discounted reward))) - - # Residual Network - self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) - self.blocks = 1 # Number of blocks in the ResNet - self.channels = 16 # Number of channels in the ResNet - self.reduced_channels_reward = 16 # Number of channels in reward head - self.reduced_channels_value = 16 # Number of channels in value head - self.reduced_channels_policy = 16 # Number of channels in policy head - self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network - self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network - self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network - - # Fully Connected Network - # self.encoding_size = 32 - # self.fc_representation_layers = [] # Define the hidden layers in the representation network - # self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network - # self.fc_reward_layers = [16] # Define the hidden layers in the reward network - # self.fc_value_layers = [] # Define the hidden layers in the value network - # self.fc_policy_layers = [] # Define the hidden layers in the policy network - - self.encoding_size = 32 - self.fc_representation_layers = [16] # Define the hidden layers in the representation network - self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network - self.fc_reward_layers = [16] # Define the hidden layers in the reward network - self.fc_value_layers = [16] # Define the hidden layers in the value network - self.fc_policy_layers = [16] - - - ### Training - self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs - self.save_model = True # Save the checkpoint in results_path as model.checkpoint - # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) - # self.training_steps = 50000 - self.training_steps = 500000 - self.batch_size = 64 # Number of parts of games to train on at each training step - self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing - self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) - self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available - - self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD - self.weight_decay = 1e-4 # L2 weights regularization - self.momentum = 0.9 # Used only if optimizer is SGD - - # Exponential learning rate schedule - self.lr_init = 0.003 # Initial learning rate - self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate - self.lr_decay_steps = 10000 - - - - ### Replay Buffer - self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer - self.num_unroll_steps = 20 # Number of game moves to keep for every batch element - self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value - self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network - self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 - - # Reanalyze (See paper appendix Reanalyse) - self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) - self.reanalyse_on_gpu = False - - - - ### Adjust the self play / training ratio to avoid over/underfitting - self.self_play_delay = 0 # Number of seconds to wait after each played game - self.training_delay = 0 # Number of seconds to wait after each training step - self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it - # fmt: on - - def visit_softmax_temperature_fn(self, trained_steps): - """ - Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. - The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. - - Returns: - Positive float. - """ - return 1 - - -class Game(AbstractGame): - """ - Game wrapper. - """ - - def __init__(self, seed=None): - self.env = TicTacToe() - - def step(self, action): - """ - Apply action to the game. - - Args: - action : action of the action_space to take. - - Returns: - The new observation, the reward and a boolean if the game has ended. - """ - observation, reward, done = self.env.step(action) - return observation, reward * 20, done - - def to_play(self): - """ - Return the current player. - - Returns: - The current player, it should be an element of the players list in the config. - """ - return self.env.to_play() - - def legal_actions(self): - """ - Should return the legal actions at each turn, if it is not available, it can return - the whole action space. At each turn, the game have to be able to handle one of returned actions. - - For complex game where calculating legal moves is too long, the idea is to define the legal actions - equal to the action space but to return a negative reward if the action is illegal. - - Returns: - An array of integers, subset of the action space. - """ - return self.env.legal_actions() - - def reset(self): - """ - Reset the game for a new game. - - Returns: - Initial observation of the game. - """ - return self.env.reset() - - def render(self): - """ - Display the game observation. - """ - self.env.render() - input("Press enter to take a step ") - - def human_to_action(self): - """ - For multiplayer games, ask the user for a legal action - and return the corresponding action number. - - Returns: - An integer from the action space. 
- """ - while True: - try: - row = int( - input( - f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - col = int( - input( - f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - choice = (row - 1) * 3 + (col - 1) - if ( - choice in self.legal_actions() - and 1 <= row - and 1 <= col - and row <= 3 - and col <= 3 - ): - break - except: - pass - print("Wrong input, try again") - return choice - - def expert_agent(self): - """ - Hard coded agent that MuZero faces to assess his progress in multiplayer games. - It doesn't influence training - - Returns: - Action as an integer to take in the current game state - """ - return self.env.expert_action() - - def action_to_string(self, action_number): - """ - Convert an action number to a string representing the action. - - Args: - action_number: an integer from the action space. - - Returns: - String representing the action. - """ - row = action_number // 3 + 1 - col = action_number % 3 + 1 - return f"Play row {row}, column {col}" - - -class TicTacToe: - def __init__(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - - def to_play(self): - return 0 if self.player == 1 else 1 - - def reset(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - return self.get_observation() - - def step(self, action): - row = action // 3 - col = action % 3 - self.board[row, col] = self.player - - done = self.have_winner() or len(self.legal_actions()) == 0 - - reward = 1 if self.have_winner() else 0 - - self.player *= -1 - - return self.get_observation(), reward, done - - def get_observation(self): - board_player1 = numpy.where(self.board == 1, 1, 0) - board_player2 = numpy.where(self.board == -1, 1, 0) - board_to_play = numpy.full((3, 3), self.player) - return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") - - def legal_actions(self): - legal = [] - for i in range(9): - row = i // 3 - col = i % 3 - if self.board[row, col] == 0: - legal.append(i) - return legal - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def expert_action(self): - board = self.board - action = numpy.random.choice(self.legal_actions()) - # Horizontal and vertical checks - for i in range(3): - if abs(sum(board[i, :])) == 2: - ind = numpy.where(board[i, :] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([i]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(board[i, :]) > 0: - return action - - if abs(sum(board[:, i])) == 2: - ind = numpy.where(board[:, i] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([i])), (3, 3) - )[0] - if self.player * sum(board[:, i]) > 0: - return action - - # Diagonal checks - diag = board.diagonal() - anti_diag = numpy.fliplr(board).diagonal() - if abs(sum(diag)) == 2: - ind = numpy.where(diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(diag) > 0: - return action - - if 
abs(sum(anti_diag)) == 2: - ind = numpy.where(anti_diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) - )[0] - if self.player * sum(anti_diag) > 0: - return action - - return action - - def render(self): - print(self.board[::-1]) diff --git a/games/tictactoe3.py b/games/tictactoe3.py deleted file mode 100644 index 1078bff0..00000000 --- a/games/tictactoe3.py +++ /dev/null @@ -1,354 +0,0 @@ -import datetime -import pathlib - -import numpy -import torch - -from .abstract_game import AbstractGame - - -class MuZeroConfig: - def __init__(self): - # fmt: off - # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization - - self.seed = 0 # Seed for numpy, torch and the game - self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available - - - - ### Game - self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) - self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length - self.players = list(range(2)) # List of players. You should only edit the length - self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation - - # Evaluate - self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) - self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class - - # 动作是否能重复 - self.action_replace = False - - ### Self-Play - self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer - self.selfplay_on_gpu = False - self.max_moves = 9 # Maximum number of moves if game is not finished before - self.num_simulations = 25 # Number of future moves self-simulated - self.discount = 1 # Chronological discount of the reward - self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). If None, visit_softmax_temperature_fn is used every time - - # Root prior exploration noise - self.root_dirichlet_alpha = 0.1 - self.root_exploration_fraction = 0.25 - - # UCB formula - self.pb_c_base = 19652 - self.pb_c_init = 1.25 - - - - ### Network - self.network = "resnet" # "resnet" / "fullyconnected" - self.network = "fullyconnected" - self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. 
Choose it so that support_size <= sqrt(max(abs(discounted reward))) - - # Residual Network - self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) - self.blocks = 1 # Number of blocks in the ResNet - self.channels = 16 # Number of channels in the ResNet - self.reduced_channels_reward = 16 # Number of channels in reward head - self.reduced_channels_value = 16 # Number of channels in value head - self.reduced_channels_policy = 16 # Number of channels in policy head - self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network - self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network - self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network - - # Fully Connected Network - self.encoding_size = 32 - self.fc_representation_layers = [] # Define the hidden layers in the representation network - self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network - self.fc_reward_layers = [16] # Define the hidden layers in the reward network - self.fc_value_layers = [] # Define the hidden layers in the value network - self.fc_policy_layers = [] # Define the hidden layers in the policy network - - - - ### Training - self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs - self.save_model = True # Save the checkpoint in results_path as model.checkpoint - self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) - # self.training_steps = 50000 - self.batch_size = 64 # Number of parts of games to train on at each training step - self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing - self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) - self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available - - self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD - self.weight_decay = 1e-4 # L2 weights regularization - self.momentum = 0.9 # Used only if optimizer is SGD - - # Exponential learning rate schedule - self.lr_init = 0.003 # Initial learning rate - self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate - self.lr_decay_steps = 10000 - - - - ### Replay Buffer - self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer - self.num_unroll_steps = 20 # Number of game moves to keep for every batch element - self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value - self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network - self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 - - # Reanalyze (See paper appendix Reanalyse) - self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) - self.reanalyse_on_gpu = False - - - - ### Adjust the self play / training ratio to avoid over/underfitting - self.self_play_delay = 0 # Number of seconds to wait after each played game - self.training_delay = 0 # Number of seconds to wait after each training step - self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it - # fmt: on - - def visit_softmax_temperature_fn(self, trained_steps): - """ - Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. - The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. - - Returns: - Positive float. - """ - return 1 - - -class Game(AbstractGame): - """ - Game wrapper. - """ - - def __init__(self, seed=None): - self.env = TicTacToe() - - def step(self, action): - """ - Apply action to the game. - - Args: - action : action of the action_space to take. - - Returns: - The new observation, the reward and a boolean if the game has ended. - """ - observation, reward, done = self.env.step(action) - return observation, reward * 20, done - - def to_play(self): - """ - Return the current player. - - Returns: - The current player, it should be an element of the players list in the config. - """ - return self.env.to_play() - - def legal_actions(self): - """ - Should return the legal actions at each turn, if it is not available, it can return - the whole action space. At each turn, the game have to be able to handle one of returned actions. - - For complex game where calculating legal moves is too long, the idea is to define the legal actions - equal to the action space but to return a negative reward if the action is illegal. - - Returns: - An array of integers, subset of the action space. - """ - return self.env.legal_actions() - - def reset(self): - """ - Reset the game for a new game. - - Returns: - Initial observation of the game. - """ - return self.env.reset() - - def render(self): - """ - Display the game observation. - """ - self.env.render() - input("Press enter to take a step ") - - def human_to_action(self): - """ - For multiplayer games, ask the user for a legal action - and return the corresponding action number. - - Returns: - An integer from the action space. 
- """ - while True: - try: - row = int( - input( - f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - col = int( - input( - f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - choice = (row - 1) * 3 + (col - 1) - if ( - choice in self.legal_actions() - and 1 <= row - and 1 <= col - and row <= 3 - and col <= 3 - ): - break - except: - pass - print("Wrong input, try again") - return choice - - def expert_agent(self): - """ - Hard coded agent that MuZero faces to assess his progress in multiplayer games. - It doesn't influence training - - Returns: - Action as an integer to take in the current game state - """ - return self.env.expert_action() - - def action_to_string(self, action_number): - """ - Convert an action number to a string representing the action. - - Args: - action_number: an integer from the action space. - - Returns: - String representing the action. - """ - row = action_number // 3 + 1 - col = action_number % 3 + 1 - return f"Play row {row}, column {col}" - - -class TicTacToe: - def __init__(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - - def to_play(self): - return 0 if self.player == 1 else 1 - - def reset(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - return self.get_observation() - - def step(self, action): - row = action // 3 - col = action % 3 - self.board[row, col] = self.player - - done = self.have_winner() or len(self.legal_actions()) == 0 - - reward = 1 if self.have_winner() else 0 - - self.player *= -1 - - return self.get_observation(), reward, done - - def get_observation(self): - board_player1 = numpy.where(self.board == 1, 1, 0) - board_player2 = numpy.where(self.board == -1, 1, 0) - board_to_play = numpy.full((3, 3), self.player) - return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") - - def legal_actions(self): - legal = [] - for i in range(9): - row = i // 3 - col = i % 3 - if self.board[row, col] == 0: - legal.append(i) - return legal - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def expert_action(self): - board = self.board - action = numpy.random.choice(self.legal_actions()) - # Horizontal and vertical checks - for i in range(3): - if abs(sum(board[i, :])) == 2: - ind = numpy.where(board[i, :] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([i]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(board[i, :]) > 0: - return action - - if abs(sum(board[:, i])) == 2: - ind = numpy.where(board[:, i] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([i])), (3, 3) - )[0] - if self.player * sum(board[:, i]) > 0: - return action - - # Diagonal checks - diag = board.diagonal() - anti_diag = numpy.fliplr(board).diagonal() - if abs(sum(diag)) == 2: - ind = numpy.where(diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(diag) > 0: - return action - - if 
abs(sum(anti_diag)) == 2: - ind = numpy.where(anti_diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) - )[0] - if self.player * sum(anti_diag) > 0: - return action - - return action - - def render(self): - print(self.board[::-1]) diff --git a/muzero_2net.py b/muzero_2net.py index 642602da..fe9f6478 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -71,7 +71,6 @@ def __init__(self, game_name, config=None, split_resources_in=1): # 重命名路径,以便区分不同的模型 self.config.results_path /= "muzero_2net" - self.config.training_steps = 100000 # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) diff --git a/muzero_general.py b/muzero_general.py index 6d8363d9..b3fb9411 100644 --- a/muzero_general.py +++ b/muzero_general.py @@ -11,7 +11,7 @@ from simplifiedMuZero.without_rb.game_play import GamePlay from simplifiedMuZero.without_rb.play_buffer import PlayBuffer -from simplifiedMuZero.without_rb.trainer import Trainer +from simplifiedMuZero.without_rb.trainer_no_PV import Trainer from muzero import load_model_menu, hyperparameter_search import models @@ -61,6 +61,9 @@ def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save else: self.config = config + # using random search instand of MCTS + self.config.temperature_threshold = 0 + # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) diff --git a/simplifiedMuZero/without_rb/trainer_no_PV.py b/simplifiedMuZero/without_rb/trainer_no_PV.py new file mode 100644 index 00000000..265b13c5 --- /dev/null +++ b/simplifiedMuZero/without_rb/trainer_no_PV.py @@ -0,0 +1,243 @@ +import numpy +import torch +import models + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, model_cls, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. + """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, 
len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss From cbf40609a195ed58987640572f5d62c1c7e201f5 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 6 Sep 2023 12:40:50 +0100 Subject: [PATCH 9/9] synchronize modifications --- simplifiedMuZero/no_pv/trainer_no_pv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplifiedMuZero/no_pv/trainer_no_pv.py b/simplifiedMuZero/no_pv/trainer_no_pv.py index e4a6080c..f51e3ef8 100644 --- a/simplifiedMuZero/no_pv/trainer_no_pv.py +++ b/simplifiedMuZero/no_pv/trainer_no_pv.py @@ -251,7 +251,7 @@ def update_weights(self, batch): # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) # loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + loss = reward_loss + policy_loss if self.config.PER: # Correct PER bias by using importance-sampling (IS) weights loss *= weight_batch
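# A minimal sketch of the exponential learning-rate schedule implemented by
# Trainer.update_lr above, assuming the values shown in the TicTacToe configs in this
# patch (lr_init=0.003, lr_decay_rate=1, lr_decay_steps=10000). With lr_decay_rate=1 the
# schedule is constant; a rate below 1 decays the rate as training_step grows.
# scheduled_lr is an illustrative helper name, not part of the repository.
def scheduled_lr(training_step, lr_init=0.003, lr_decay_rate=1.0, lr_decay_steps=10000):
    return lr_init * lr_decay_rate ** (training_step / lr_decay_steps)

print(scheduled_lr(0), scheduled_lr(50_000), scheduled_lr(50_000, lr_decay_rate=0.9))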
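# A minimal sketch of the per-step losses computed by Trainer.loss_function above, and of
# how the hunk in this commit changes the total: the original trainer combines
# value_loss * value_loss_weight + reward_loss + policy_loss, while trainer_no_pv now
# keeps only reward_loss + policy_loss. The shapes below (batch of 4, support of
# 2 * support_size + 1 with support_size=10, 9 actions) are assumptions matching the
# TicTacToe configs, and the targets are placeholder distributions, not real batch data.
import torch

batch, support, actions = 4, 2 * 10 + 1, 9
value = torch.randn(batch, support)           # value logits over the support
reward = torch.randn(batch, support)          # reward logits over the support
policy_logits = torch.randn(batch, actions)   # policy logits over the action space
target_value = torch.softmax(torch.randn(batch, support), dim=1)
target_reward = torch.softmax(torch.randn(batch, support), dim=1)
target_policy = torch.softmax(torch.randn(batch, actions), dim=1)

log_softmax = torch.nn.LogSoftmax(dim=1)
value_loss = (-target_value * log_softmax(value)).sum(1)       # cross-entropy per sample
reward_loss = (-target_reward * log_softmax(reward)).sum(1)
policy_loss = (-target_policy * log_softmax(policy_logits)).sum(1)

full_loss = (0.25 * value_loss + reward_loss + policy_loss).mean()  # original weighting
no_value_loss = (reward_loss + policy_loss).mean()                  # weighting after this commit
print(full_loss.item(), no_value_loss.item())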