From 83bc5fb18c725590d0972d59d719d2184ba06535 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 15 Feb 2023 22:02:43 +0000 Subject: [PATCH 1/9] fix: fixed ray's error 'No module named aiohttp.signals' --- requirements.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.lock b/requirements.lock index 742f745f..4d7ba441 100644 --- a/requirements.lock +++ b/requirements.lock @@ -6,7 +6,7 @@ # absl-py==1.0.0 # via tensorboard -aiohttp==3.8.1 +aiohttp==3.7.4 # via # aiohttp-cors # ray @@ -16,7 +16,7 @@ aioredis==1.3.1 # via ray aiosignal==1.2.0 # via aiohttp -async-timeout==4.0.1 +async-timeout==3.0.1 # via # aiohttp # aioredis @@ -171,7 +171,7 @@ pytz==2021.3 # via pandas pyyaml==6.0 # via ray -ray==1.5.2 +ray==1.2 # via -r requirements.in redis==4.0.1 # via ray From 55582a8eda97c59a55bdf35ede27d41b62a5fd49 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Fri, 11 Aug 2023 04:33:17 +0100 Subject: [PATCH 2/9] merge representation network and dynamic network --- .gitignore | 4 +- models.py | 47 +- muzero.py | 4 + muzero_2net.py | 718 +++++++++++++++++ self_play.py | 127 ++- shared_storage.py | 10 +- simplifiedMuZero/RHEA.py | 12 + simplifiedMuZero/models2.py | 150 ++++ simplifiedMuZero/models_2net.py | 696 +++++++++++++++++ .../models_without_replay_buffer.py | 696 +++++++++++++++++ .../muzero_without_replay_buffer.py | 723 ++++++++++++++++++ simplifiedMuZero/replay_buffer3.py | 373 +++++++++ simplifiedMuZero/self_play_2net.py | 622 +++++++++++++++ .../self_play_without_replay_buffer.py | 624 +++++++++++++++ simplifiedMuZero/trainer_2net.py | 300 ++++++++ .../trainer_without_replay_buffer.py | 303 ++++++++ test/Simple_grid_test.py | 23 + test/ray_test.py | 20 + 18 files changed, 5388 insertions(+), 64 deletions(-) create mode 100644 muzero_2net.py create mode 100644 simplifiedMuZero/RHEA.py create mode 100644 simplifiedMuZero/models2.py create mode 100644 simplifiedMuZero/models_2net.py create mode 100644 simplifiedMuZero/models_without_replay_buffer.py create mode 100644 simplifiedMuZero/muzero_without_replay_buffer.py create mode 100644 simplifiedMuZero/replay_buffer3.py create mode 100644 simplifiedMuZero/self_play_2net.py create mode 100644 simplifiedMuZero/self_play_without_replay_buffer.py create mode 100644 simplifiedMuZero/trainer_2net.py create mode 100644 simplifiedMuZero/trainer_without_replay_buffer.py create mode 100644 test/Simple_grid_test.py create mode 100644 test/ray_test.py diff --git a/.gitignore b/.gitignore index f106bb6b..844f676b 100644 --- a/.gitignore +++ b/.gitignore @@ -90,4 +90,6 @@ venv.bak/ # mypy .mypy_cache/ .dmypy.json -dmypy.json \ No newline at end of file +dmypy.json + +results/ \ No newline at end of file diff --git a/models.py b/models.py index be847fef..d4b8bc2f 100644 --- a/models.py +++ b/models.py @@ -94,6 +94,7 @@ def __init__( super().__init__() self.action_space_size = action_space_size self.full_support_size = 2 * support_size + 1 + # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 self.representation_network = torch.nn.DataParallel( mlp( @@ -107,6 +108,7 @@ def __init__( ) ) + #dynamics的输入是encoding_size+action_space_size self.dynamics_encoded_state_network = torch.nn.DataParallel( mlp( encoding_size + self.action_space_size, @@ -115,14 +117,14 @@ def __init__( ) ) self.dynamics_reward_network = torch.nn.DataParallel( - mlp(encoding_size, fc_reward_layers, self.full_support_size) + mlp(encoding_size, fc_reward_layers, self.full_support_size) 
#最后的输出为full_support_size,因为范围是[-support_size, support_size] ) self.prediction_policy_network = torch.nn.DataParallel( - mlp(encoding_size, fc_policy_layers, self.action_space_size) + mlp(encoding_size, fc_policy_layers, self.action_space_size) #输出action的概率 ) self.prediction_value_network = torch.nn.DataParallel( - mlp(encoding_size, fc_value_layers, self.full_support_size) + mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) def prediction(self, encoded_state): @@ -134,16 +136,19 @@ def representation(self, observation): encoded_state = self.representation_network( observation.view(observation.shape[0], -1) ) + + # 正则化 # Scale encoded state between [0, 1] (See appendix paper Training) min_encoded_state = encoded_state.min(1, keepdim=True)[0] max_encoded_state = encoded_state.max(1, keepdim=True)[0] scale_encoded_state = max_encoded_state - min_encoded_state - scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 + scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN encoded_state_normalized = ( encoded_state - min_encoded_state ) / scale_encoded_state return encoded_state_normalized + # dynamic同representation的唯一不同就是前者需要将encoded_state和action合并在一起作为输入,而representation不需要绑定action def dynamics(self, encoded_state, action): # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) action_one_hot = ( @@ -151,18 +156,19 @@ def dynamics(self, encoded_state, action): .to(action.device) .float() ) - action_one_hot.scatter_(1, action.long(), 1.0) + action_one_hot.scatter_(1, action.long(), 1.0) #将action的位置赋值为1 x = torch.cat((encoded_state, action_one_hot), dim=1) next_encoded_state = self.dynamics_encoded_state_network(x) reward = self.dynamics_reward_network(next_encoded_state) + # 正则化 # Scale encoded state between [0, 1] (See paper appendix Training) min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0] max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0] scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state - scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 + scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN next_encoded_state_normalized = ( next_encoded_state - min_next_encoded_state ) / scale_next_encoded_state @@ -172,7 +178,7 @@ def dynamics(self, encoded_state, action): def initial_inference(self, observation): encoded_state = self.representation(observation) policy_logits, value = self.prediction(encoded_state) - # reward equal to 0 for consistency + # reward equal to 0 for consistency 一致性奖励等于 0 reward = torch.log( ( torch.zeros(1, self.full_support_size) @@ -181,6 +187,7 @@ def initial_inference(self, observation): .to(observation.device) ) ) + # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 return ( value, @@ -605,8 +612,8 @@ def initial_inference(self, observation): reward = torch.log( ( torch.zeros(1, self.full_support_size) - .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) - .repeat(len(observation), 1) + .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # 将support_size位置设为1 + .repeat(len(observation), 1) # 根据observation的长度复制,保证reward的维度于observation的一致,即之前的observation也赋值 .to(observation.device) ) ) @@ -637,29 +644,29 @@ def mlp( sizes = [input_size] + layer_sizes + [output_size] layers = [] for i in range(len(sizes) - 1): - act = activation if i < len(sizes) - 2 else 
output_activation + act = activation if i < len(sizes) - 2 else output_activation #激活函数,最后一层是output_activation,其余的都一样 layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()] return torch.nn.Sequential(*layers) -def support_to_scalar(logits, support_size): +def support_to_scalar(logits, support_size): # logits 是 value的对数值,support_size是转换后的范围。 """ Transform a categorical representation to a scalar See paper appendix Network Architecture """ # Decode to a scalar - probabilities = torch.softmax(logits, dim=1) + probabilities = torch.softmax(logits, dim=1) # softmax在指定的向量和为1,softmax扩大大的,缩小下的,shape为[stacked_size, fully_support_size] support = ( - torch.tensor([x for x in range(-support_size, support_size + 1)]) + torch.tensor([x for x in range(-support_size, support_size + 1)]) # 范围是-support_size, support_szie。因为support_size+1 .expand(probabilities.shape) .float() .to(device=probabilities.device) - ) - x = torch.sum(support * probabilities, dim=1, keepdim=True) + ) # shape 为【stacked_size, fully_support_size】, + x = torch.sum(support * probabilities, dim=1, keepdim=True) # 输出为【1,fully_support_size】,因为dim=1,另外keep_dim=True,所有是【1,fully_support_size】而不是【fully_support_size] # Invert the scaling (defined in https://arxiv.org/abs/1805.11593) - x = torch.sign(x) * ( - ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) + x = torch.sign(x) * ( # sign函数为分段函数, x小于0为-1,大于0为1,0为0。主要是获取x的符号 + ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) # (sqrt(1+0.04*(|x|+1.001))-1)/0.002 ** 2 - 1 ) @@ -675,9 +682,9 @@ def scalar_to_support(x, support_size): x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x # Encode on a vector - x = torch.clamp(x, -support_size, support_size) - floor = x.floor() - prob = x - floor + x = torch.clamp(x, -support_size, support_size) # 裁剪x的范围,使x的范围定为[-support_size, support_size] + floor = x.floor() # floor向下取整,类似的,ceil为向上取整 + prob = x - floor # 减去整数,保留小数部分(因为在support_to_scala部分是index位置乘上概率) logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device) logits.scatter_( 2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1) diff --git a/muzero.py b/muzero.py index f7601c9b..3e075e96 100644 --- a/muzero.py +++ b/muzero.py @@ -43,6 +43,7 @@ def __init__(self, game_name, config=None, split_resources_in=1): # Load the game and the config from the module with the game name try: game_module = importlib.import_module("games." + game_name) + print("games." 
+ game_name) self.Game = game_module.Game self.config = game_module.MuZeroConfig() except ModuleNotFoundError as err: @@ -671,7 +672,10 @@ def load_model_menu(muzero, game_name): choice = input("Invalid input, enter a number listed above: ") choice = int(choice) if choice == 0: + start_time = time.time() muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) elif choice == 1: load_model_menu(muzero, game_name) elif choice == 2: diff --git a/muzero_2net.py b/muzero_2net.py new file mode 100644 index 00000000..bfdc38b0 --- /dev/null +++ b/muzero_2net.py @@ -0,0 +1,718 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +sys.path.append("") + +import diagnose_model +import simplifiedMuZero.models_2net as models +import simplifiedMuZero.replay_buffer3 as replay_buffer +import simplifiedMuZero.self_play_2net as self_play +import shared_storage +import simplifiedMuZero.trainer_2net as trainer + + +class MuZero: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. + """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if 
self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. + """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.SimplifiedMuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/self_play.py b/self_play.py index d90fe5db..d09c5e87 100644 --- a/self_play.py +++ b/self_play.py @@ -33,8 +33,8 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): shared_storage.get_info.remote("training_step") ) < self.config.training_steps and not ray.get( shared_storage.get_info.remote("terminate") - ): - self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + 
self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 if not test_mode: game_history = self.play_game( @@ -107,6 +107,16 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): self.close_game() + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory def play_game( self, temperature, temperature_threshold, render, opponent, muzero_player ): @@ -116,7 +126,7 @@ def play_game( game_history = GameHistory() observation = self.game.reset() game_history.action_history.append(0) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) game_history.to_play_history.append(self.game.to_play()) @@ -128,7 +138,7 @@ def play_game( with torch.no_grad(): while ( not done and len(game_history.action_history) <= self.config.max_moves - ): + ): # 游戏没有结束且运行步数小于最大移动步长 assert ( len(numpy.array(observation).shape) == 3 ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" @@ -138,14 +148,17 @@ def play_game( stacked_observations = game_history.get_stacked_observations( -1, self.config.stacked_observations, len(self.config.action_space) ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + # 一下的if-else部分主要是为了选择一个动作 # Choose the action if opponent == "self" or muzero_player == self.game.to_play(): root, mcts_info = MCTS(self.config).run( self.model, stacked_observations, self.game.legal_actions(), - self.game.to_play(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 True, ) action = self.select_action( @@ -154,7 +167,7 @@ def play_game( if not temperature_threshold or len(game_history.action_history) < temperature_threshold else 0, - ) + ) # 根据temperature选择动作 if render: print(f'Tree depth: {mcts_info["max_tree_depth"]}') @@ -162,11 +175,11 @@ def play_game( f"Root value for player {self.game.to_play()}: {root.value():.2f}" ) else: - action, root = self.select_opponent_action( + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 opponent, stacked_observations ) - observation, reward, done = self.game.step(action) + observation, reward, done = self.game.step(action) # 运行游戏 if render: print(f"Played action: {self.game.action_to_string(action)}") @@ -176,7 +189,7 @@ def play_game( # Next batch game_history.action_history.append(action) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 game_history.reward_history.append(reward) game_history.to_play_history.append(self.game.to_play()) @@ -219,7 +232,12 @@ def select_opponent_action(self, opponent, stacked_observations): 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' ) - @staticmethod + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod 
# static-method decorator, similar to the `static` keyword in other languages
     def select_action(node, temperature):
         """
         Select action according to the visit count distribution and the temperature.
@@ -257,6 +275,25 @@ class MCTS:
     def __init__(self, config):
         self.config = config

+    # How run() works:
+    # 1. Obtain the root node:
+    #    (1) if a node is passed in (override_root_with), use it as the root;
+    #    (2) otherwise:
+    #        i.   create a new Node(0)
+    #        ii.  call initial_inference on the observation to obtain the reward, hidden state, legal actions, etc.
+    #        iii. copy the data from step ii into the newly created root node
+    #    NB: in case (1) initial_inference does not need to be called at all.
+    # 2. Check whether exploration noise should be added to the root.
+    # 3. Simulate games in a loop; the number of iterations is given by num_simulations:
+    #    (1) set the current node to the root and append it to the search path
+    #    (2) if the node is already expanded, select a child by UCB score, make it the current node and append it to the search path
+    #    (3) repeat (2) until a node with expanded() == False is reached
+    #    (4) take search_path[-2] as the parent (the last entry is the node itself)
+    #    (5) call recurrent_inference to obtain the reward, hidden state, policy logits, etc.
+    #    (6) expand the node, i.e. create its children
+    #    (7) backpropagate: increment the visit counts and accumulate the discounted values along the path
+    #    NB: each simulation expands exactly one node, so the tree grows one node at a time.
+    # 4. Return the expanded root so the caller can select an action from it.
     def run(
         self,
         model,
@@ -272,7 +309,7 @@ def run(
         We then run a Monte Carlo Tree Search using only action sequences and the model
         learned by the network.
         """
-        if override_root_with:
+        if override_root_with: # if a root node is provided, use it; otherwise create one
             root = override_root_with
             root_predicted_value = None
         else:
@@ -282,7 +319,7 @@ def run(
                 .float()
                 .unsqueeze(0)
                 .to(next(model.parameters()).device)
-            )
+            ) # convert the observation to a tensor and add a batch dimension; the history length is set by stacked_observations (use 0 to keep no previous observations)
             (
                 root_predicted_value,
                 reward,
@@ -316,16 +353,17 @@ def run(

         min_max_stats = MinMaxStats()

         max_tree_depth = 0
-        for _ in range(self.config.num_simulations):
+        for _ in range(self.config.num_simulations): # run the simulations
            virtual_to_play = to_play
            node = root
            search_path = [node]
            current_tree_depth = 0

-            while node.expanded():
+            # expanded() checks whether the node has children; no children means it has not been expanded yet
+            while node.expanded(): # descend until an unexpanded node is found; expanded nodes are traversed via select_child
                current_tree_depth += 1
-                action, node = self.select_child(node, min_max_stats)
-                search_path.append(node)
+                action, node = self.select_child(node, min_max_stats) # pick the action with the highest UCB score (ties broken at random)
+                search_path.append(node) # append the node to the search path

                # Players play turn by turn
                if virtual_to_play + 1 < len(self.config.players):
@@ -333,15 +371,18 @@ def run(
                else:
                    virtual_to_play = self.config.players[0]

+            # (inside the tree the learned dynamics network stands in for the real environment step)
            # Inside the search tree we use the dynamics function to obtain the next hidden
            # state given an action and the previous hidden state
-            parent = search_path[-2]
+            parent = search_path[-2] # the current node is at index -1, so index -2 is its parent
            value, reward, policy_logits, hidden_state = model.recurrent_inference(
                parent.hidden_state,
                torch.tensor([[action]]).to(parent.hidden_state.device),
            )
            value = models.support_to_scalar(value, self.config.support_size).item()
            reward = models.support_to_scalar(reward, self.config.support_size).item()
+            # expand one level below this node: actions is the list of legal actions,
+            # policy_logits supplies the prior for each of those actions
            node.expand(
                self.config.action_space,
                virtual_to_play,
@@ -360,6 +401,9 @@ def run(
        }
        return root, extra_info

+    # MCTS.select_child and SelfPlay.select_action use different logic:
+    # 1. select_child picks by UCB score, while select_action samples from the visit counts and the temperature.
+    # 2. select_child returns a Node (the child reached by taking an action from the current state); select_action returns only an action.
    def select_child(self, node, min_max_stats):
        """
        Select the child with the highest UCB score.
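The temperature rule described in the comments above select_action (probabilities proportional to visit_count^(1/T), argmax at T=0, uniform as T→+inf) can be checked in isolation. A minimal sketch, not the repository's code; actions and visit_counts are hypothetical stand-ins for the statistics stored on the root's children:

import numpy

def select_action_from_counts(actions, visit_counts, temperature):
    """Pick an action from visit counts c_i with probability proportional to c_i**(1/T)."""
    visit_counts = numpy.array(visit_counts, dtype="float64")
    if temperature == 0:
        # T = 0: greedy, take the most visited action
        return actions[numpy.argmax(visit_counts)]
    if temperature == float("inf"):
        # T = +inf: every count collapses to 1, i.e. a uniform random choice
        return numpy.random.choice(actions)
    distribution = visit_counts ** (1 / temperature)
    distribution /= distribution.sum()
    return numpy.random.choice(actions, p=distribution)

# Example: counts [10, 5, 1] favour action 0 strongly at T=0.5, only mildly at T=2
print(select_action_from_counts([0, 1, 2], [10, 5, 1], 0.5))
print(select_action_from_counts([0, 1, 2], [10, 5, 1], 2.0))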
@@ -368,7 +412,7 @@ def select_child(self, node, min_max_stats): self.ucb_score(node, child, min_max_stats) for action, child in node.children.items() ) - action = numpy.random.choice( + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) [ action for action, child in node.children.items() @@ -377,33 +421,37 @@ def select_child(self, node, min_max_stats): ) return action, node.children[action] - def ucb_score(self, parent, child, min_max_stats): + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 """ The score for a node is based on its value, plus an exploration bonus based on the prior. """ pb_c = ( math.log( - (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 ) + self.config.pb_c_init ) pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) - prior_score = pb_c * child.prior + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior if child.visit_count > 0: # Mean value Q - value_score = min_max_stats.normalize( + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 child.reward - + self.config.discount - * (child.value() if len(self.config.players) == 1 else -child.value()) + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 ) else: value_score = 0 - return prior_score + value_score + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 - def backpropagate(self, search_path, value, to_play, min_max_stats): + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 """ At the end of a simulation, we propagate the evaluation all the way up the tree to the root. @@ -432,7 +480,7 @@ def backpropagate(self, search_path, value, to_play, min_max_stats): class Node: def __init__(self, prior): - self.visit_count = 0 + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 self.to_play = -1 self.prior = prior self.value_sum = 0 @@ -449,6 +497,8 @@ def value(self): return self.value_sum / self.visit_count def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 """ We expand a node using the value, reward and policy prediction obtained from the neural network. @@ -460,7 +510,7 @@ def expand(self, actions, to_play, reward, policy_logits, hidden_state): policy_values = torch.softmax( torch.tensor([policy_logits[0][a] for a in actions]), dim=0 ).tolist() - policy = {a: policy_values[i] for i, a in enumerate(actions)} + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 for action, p in policy.items(): self.children[action] = Node(p) @@ -512,7 +562,7 @@ def store_search_statistics(self, root, action_space): def get_stacked_observations( self, index, num_stacked_observations, action_space_size - ): + ): #根据索引index获取observation序列 """ Generate a new observation with the observation at the index position and num_stacked_observations past observations and actions stacked. 
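A simplified sketch of what get_stacked_observations is doing: the observation at the given index is followed by num_stacked past observations, each paired with a plane encoding the action taken from it, and positions before the start of the game are padded with zeros. This is illustrative only, assuming image-like observations of shape (channels, height, width), and omits the repository's exact code:

import numpy

def stack_observations(observation_history, action_history, index, num_stacked):
    """Concatenate the observation at `index` with `num_stacked` past (observation, action-plane) pairs."""
    index = index % len(observation_history)  # support negative indices, as in GameHistory
    stacked = observation_history[index].copy()
    for past in reversed(range(index - num_stacked, index)):
        if past >= 0:
            # one extra plane filled with the action taken after observation `past`
            action_plane = numpy.ones_like(observation_history[past][:1]) * action_history[past + 1]
            previous = numpy.concatenate((observation_history[past], action_plane))
        else:
            # before the start of the game: zero padding keeps the stacked shape fixed
            previous = numpy.concatenate(
                (numpy.zeros_like(observation_history[index]),
                 numpy.zeros_like(observation_history[index][:1]))
            )
        stacked = numpy.concatenate((stacked, previous))
    return stacked

# Three 1x2x2 observations, two past frames stacked behind the latest one -> shape (5, 2, 2)
history = [numpy.full((1, 2, 2), i, dtype=float) for i in range(3)]
actions = [0, 1, 2]  # actions[i] is the action that produced observation i
print(stack_observations(history, actions, index=-1, num_stacked=2).shape)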
@@ -520,12 +570,12 @@ def get_stacked_observations( # Convert to positive index index = index % len(self.observation_history) - stacked_observations = self.observation_history[index].copy() + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 for past_observation_index in reversed( range(index - num_stacked_observations, index) ): if 0 <= past_observation_index: - previous_observation = numpy.concatenate( + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 ( self.observation_history[past_observation_index], [ @@ -543,7 +593,7 @@ def get_stacked_observations( ) ) - stacked_observations = numpy.concatenate( + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 (stacked_observations, previous_observation) ) @@ -556,15 +606,16 @@ class MinMaxStats: """ def __init__(self): - self.maximum = -float("inf") - self.minimum = float("inf") + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: + def normalize(self, value): #对value规范化,公式为(x-a)/(a-b) 当x∈[a,b]时 + if self.maximum > self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. + """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, + + # Launch workers + # 此处调用worker进行self play,把结果存在replay_buffer里 + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + + # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + # 
使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. + """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/simplifiedMuZero/replay_buffer3.py b/simplifiedMuZero/replay_buffer3.py new file mode 100644 index 00000000..762d5a0e --- /dev/null +++ b/simplifiedMuZero/replay_buffer3.py @@ -0,0 +1,373 @@ +import copy +import time + +import numpy +import ray +import torch + +import simplifiedMuZero.models_2net as models + + +@ray.remote +class ReplayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. 
+ """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history, shared_storage=None): + if self.config.PER: + if game_history.priorities is not None: + # Avoid read only array when loading replay buffer from disk + game_history.priorities = numpy.copy(game_history.priorities) + else: + # Initial priorities for the prioritized replay (See paper appendix Training) + priorities = [] + for i, root_value in enumerate(game_history.root_values): + priority = ( + numpy.abs( + root_value - self.compute_target_value(game_history, i) + ) + ** self.config.PER_alpha + ) + priorities.append(priority) + + game_history.priorities = numpy.array(priorities, dtype="float32") + game_history.game_priority = numpy.max(game_history.priorities) + + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + if shared_storage: + shared_storage.set_info.remote("num_played_games", self.num_played_games) + shared_storage.set_info.remote("num_played_steps", self.num_played_steps) + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = [] if self.config.PER else None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + if self.config.PER: + weight_batch.append(1 / (self.total_samples * game_prob * pos_prob)) + + if self.config.PER: + weight_batch = numpy.array(weight_batch, dtype="float32") / max( + weight_batch + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) 
+ + def sample_game(self, force_uniform=False): + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. + """ + game_prob = None + if self.config.PER and not force_uniform: + game_probs = numpy.array( + [game_history.game_priority for game_history in self.buffer.values()], + dtype="float32", + ) + game_probs /= numpy.sum(game_probs) + game_index = numpy.random.choice(len(self.buffer), p=game_probs) + game_prob = game_probs[game_index] + else: + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games, force_uniform=False): + if self.config.PER and not force_uniform: + game_id_list = [] + game_probs = [] + for game_id, game_history in self.buffer.items(): + game_id_list.append(game_id) + game_probs.append(game_history.game_priority) + game_probs = numpy.array(game_probs, dtype="float32") + game_probs /= numpy.sum(game_probs) + game_prob_dict = dict( + [(game_id, prob) for game_id, prob in zip(game_id_list, game_probs)] + ) + selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) + else: + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history, force_uniform=False): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + if self.config.PER and not force_uniform: + position_probs = game_history.priorities / sum(game_history.priorities) + position_index = numpy.random.choice(len(position_probs), p=position_probs) + position_prob = position_probs[position_index] + else: + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + if next(iter(self.buffer)) <= game_id: + if self.config.PER: + # Avoid read only array when loading replay buffer from disk + game_history.priorities = numpy.copy(game_history.priorities) + self.buffer[game_id] = game_history + + def update_priorities(self, priorities, index_info): + """ + Update game and position priorities with priorities calculated during the training. + See Distributed Prioritized Experience Replay https://arxiv.org/abs/1803.00933 + """ + for i in range(len(index_info)): + game_id, game_pos = index_info[i] + + # The element could have been removed since its selection and training + if next(iter(self.buffer)) <= game_id: + # Update position priorities + priority = priorities[i, :] + start_index = game_pos + end_index = min( + game_pos + len(priority), len(self.buffer[game_id].priorities) + ) + self.buffer[game_id].priorities[start_index:end_index] = priority[ + : end_index - start_index + ] + + # Update game priorities + self.buffer[game_id].game_priority = numpy.max( + self.buffer[game_id].priorities + ) + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. 
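+        # In symbols (illustrative, with n = td_steps and d = discount):
+        #   value(t) = sum_{i=0..n-1} d^i * r_{t+1+i}  +  d^n * v(t+n)
+        # where v(t+n) is the (possibly reanalysed) root value at the bootstrap index,
+        # each term is sign-flipped when that position belongs to the other player,
+        # and the bootstrap term is 0 if t+n falls past the end of the game.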
+ bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. + """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions + + +@ray.remote +class Reanalyse: + """ + Class which run in a dedicated thread to update the replay buffer with fresh information. + See paper appendix Reanalyse. 
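+    A game is sampled uniformly from the replay buffer, its observations are re-evaluated
+    with the latest network (initial_inference), and the resulting values overwrite
+    game_history.reanalysed_predicted_root_values so that value targets stay fresh.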
+ """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu")) + self.model.eval() + + self.num_reanalysed_games = initial_checkpoint["num_reanalysed_games"] + + def reanalyse(self, replay_buffer, shared_storage): + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) + + game_id, game_history, _ = ray.get( + replay_buffer.sample_game.remote(force_uniform=True) + ) + + # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + if self.config.use_last_model_value: + observations = numpy.array( + [ + game_history.get_stacked_observations( + i, + self.config.stacked_observations, + len(self.config.action_space), + ) + for i in range(len(game_history.root_values)) + ] + ) + + observations = ( + torch.tensor(observations) + .float() + .to(next(self.model.parameters()).device) + ) + values = models.support_to_scalar( + self.model.initial_inference(observations)[0], + self.config.support_size, + ) + game_history.reanalysed_predicted_root_values = ( + torch.squeeze(values).detach().cpu().numpy() + ) + + replay_buffer.update_game_history.remote(game_id, game_history) + self.num_reanalysed_games += 1 + shared_storage.set_info.remote( + "num_reanalysed_games", self.num_reanalysed_games + ) diff --git a/simplifiedMuZero/self_play_2net.py b/simplifiedMuZero/self_play_2net.py new file mode 100644 index 00000000..af2a2e39 --- /dev/null +++ b/simplifiedMuZero/self_play_2net.py @@ -0,0 +1,622 @@ +import math +import time + +import numpy +import ray +import torch + +import simplifiedMuZero.models_2net as models + + +@ray.remote +class SelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. 
+ """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.SimplifiedMuZeroNetwork(self.config) + # self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. 
+ """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
+ + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class MCTS: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." 
+ assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. 
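+
+        Concretely (matching the code below):
+            pb_c = log((N_parent + pb_c_base + 1) / pb_c_base) + pb_c_init
+            pb_c *= sqrt(N_parent) / (N_child + 1)
+            score = pb_c * prior + normalized(Q)
+        where Q = reward + discount * child value (negated for two-player games) and the
+        value term is 0 for unvisited children.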
+ """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. + """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. + """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. 
+ """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class MCTS: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 
获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
+ root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. 
+ """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. + """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. + """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. 
+ """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/trainer_without_replay_buffer.py b/simplifiedMuZero/trainer_without_replay_buffer.py new file mode 100644 index 00000000..48236e0f --- /dev/null +++ b/simplifiedMuZero/trainer_without_replay_buffer.py @@ -0,0 +1,303 @@ +import copy +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." 
+ ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + # update weights 与 continuous update weights 的区别 + # 1. update weights 是实际计算更新network的权重 + # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/test/Simple_grid_test.py b/test/Simple_grid_test.py new file mode 100644 index 00000000..501ac2df --- /dev/null +++ b/test/Simple_grid_test.py @@ -0,0 +1,23 @@ +import numpy as np + +from games.simple_grid import Game +import random +import time + +g = Game() +observation = g.env.get_observation() + +# print(observer) +for i in range(1000): + actions = g.legal_actions() + observation, reward, done = g.step(random.choice(actions)) + # g.render() + print(np.array(observation).shape) + + if done: + break + + + # time.sleep(10) + +g.close() diff --git a/test/ray_test.py b/test/ray_test.py new file mode 100644 index 00000000..7d7f0cf6 --- /dev/null +++ b/test/ray_test.py @@ -0,0 +1,20 @@ +import ray +import time + +ray.init() + +@ray.remote +def hello(): + return "Hello world!" 
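[Editor's aside] test/ray_test.py above only exercises Ray's task API (@ray.remote on a function, hello.remote(), ray.get, ray.put). The training pipeline additionally relies on Ray *actors*: the self-play, trainer and replay-buffer classes in this repo are decorated with @ray.remote and their methods are called with .remote(). A minimal, self-contained sketch of that actor pattern (the Counter class is illustrative only, not part of the repository):

import ray

@ray.remote
class Counter:
    # A stateful worker living in its own process, analogous to the
    # SelfPlay / Trainer / ReplayBuffer actors used by the training loop.
    def __init__(self):
        self.n = 0

    def increment(self):
        self.n += 1
        return self.n

ray.init(ignore_reinit_error=True)
counter = Counter.remote()                                 # spawn the actor process
futures = [counter.increment.remote() for _ in range(3)]   # async calls return ObjectRefs
print(ray.get(futures))                                    # -> [1, 2, 3]
ray.shutdown()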
+ +object_id = hello.remote() + +hello = ray.get(object_id) + +print(hello) + +# time.sleep(100) +results_ids = [ray.put(i) for i in range(10)] +print(ray.get(results_ids)) + +ray.shutdown() \ No newline at end of file From d5fc4874b8196616e033f547d7529ad1d47792b0 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Mon, 14 Aug 2023 08:28:37 +0100 Subject: [PATCH 3/9] remove replay buffer --- MuZero_No_Replay_Buffer.py | 1260 +++++++++++++++++ muzero_2net.py | 8 +- ...ffer.py => muzero_without_replay_buffer.py | 26 +- replay_buffer.py | 29 +- self_play.py | 2 +- simplifiedMuZero/__init__.py | 0 simplifiedMuZero/{ => net2}/models_2net.py | 25 +- .../replay_buffer_2net.py} | 4 +- simplifiedMuZero/{ => net2}/self_play_2net.py | 2 +- simplifiedMuZero/{ => net2}/trainer_2net.py | 4 +- simplifiedMuZero/{ => search_policy}/RHEA.py | 0 simplifiedMuZero/search_policy/__init__.py | 0 .../models_without_replay_buffer.py | 0 .../self_play_without_replay_buffer.py | 2 +- .../trainer_without_replay_buffer.py | 6 +- test/game_play_test.py | 696 +++++++++ trainer.py | 4 +- 17 files changed, 2020 insertions(+), 48 deletions(-) create mode 100644 MuZero_No_Replay_Buffer.py rename simplifiedMuZero/muzero_without_replay_buffer.py => muzero_without_replay_buffer.py (96%) create mode 100644 simplifiedMuZero/__init__.py rename simplifiedMuZero/{ => net2}/models_2net.py (98%) rename simplifiedMuZero/{replay_buffer3.py => net2/replay_buffer_2net.py} (98%) rename simplifiedMuZero/{ => net2}/self_play_2net.py (99%) rename simplifiedMuZero/{ => net2}/trainer_2net.py (98%) rename simplifiedMuZero/{ => search_policy}/RHEA.py (100%) create mode 100644 simplifiedMuZero/search_policy/__init__.py rename simplifiedMuZero/{ => without_rb}/models_without_replay_buffer.py (100%) rename simplifiedMuZero/{ => without_rb}/self_play_without_replay_buffer.py (99%) rename simplifiedMuZero/{ => without_rb}/trainer_without_replay_buffer.py (97%) create mode 100644 test/game_play_test.py diff --git a/MuZero_No_Replay_Buffer.py b/MuZero_No_Replay_Buffer.py new file mode 100644 index 00000000..bf280c71 --- /dev/null +++ b/MuZero_No_Replay_Buffer.py @@ -0,0 +1,1260 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +# import simplifiedMuZero.without_rb.models_without_replay_buffer as models +import models +# import replay_buffer +# import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play +import shared_storage +# import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer +from self_play import MCTS, GameHistory +from muzero import load_model_menu, CPUActor + +# training_step是一个全局变量,用来存储现有的运行次数,不要超过游戏config里的training_steps,如30000次 + +class GamePlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. 
+ """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + def continuous_self_play(self, test_mode=False): + # def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while self.trained_steps < self.config.training_steps and not self.terminate: # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + # 此处不要用set——weights,因为现在移除了replay_buffer,不需要shared_storage了 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + # game_history = self.play_game( + # self.config.visit_softmax_temperature_fn( + # trained_steps=ray.get( + # shared_storage.get_info.remote("training_step") + # ) + # ), + # self.config.temperature_threshold, + # False, + # "self", + # 0, + # ) + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + self.trained_steps + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + # replay_buffer.save_game.remote(game_history, shared_storage) + return game_history + + else: + # Take the best action (no exploration) in test mode # 在测试模式下采取最佳行动(无探索) + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + # play game 与continuous self play 的区别: + # 1. play game 是实际运行游戏,游戏的结果存在game history里,不向replay buffer里写 + # 2. continuous self play 调用play game,把获取到的game history 异步写进 replay buffer + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 
返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. 
Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + +class Trainer_without_Replay_Buffer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + # update weights 与 continuous update weights 的区别 + # 1. update weights 是实际计算更新network的权重 + # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + # 此处才算一次迭代完成,training step加1 + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss + +class MuZero_No_Replay_Buffer: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_No_Replay_Buffer("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
+ ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + # cpu_actor = CPUActor.remote() + # cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + # 移除ray + cpu_actor = CPUActor() + cpu_weights = cpu_actor.get_initial_weights(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + # self.training_worker = trainer.Trainer.options( + # num_cpus=0, + # num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + # ).remote(self.checkpoint, self.config) + # + # self.shared_storage_worker = shared_storage.SharedStorage.remote( + # self.checkpoint, + # self.config, + # ) + # self.shared_storage_worker.set_info.remote("terminate", False) + # + # self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + # self.checkpoint, self.replay_buffer, self.config + # ) + + # 初始化权重 + self.training_worker = Trainer_without_Replay_Buffer(self.checkpoint, self.config) + + # #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + # if self.config.use_last_model_value: + # self.reanalyse_worker = replay_buffer.Reanalyse.options( + # num_cpus=0, + # num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + # ).remote(self.checkpoint, self.config) + # + # self.self_play_workers = [ + # self_play.SelfPlay.options( + # num_cpus=0, + # num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + # ).remote( + # self.checkpoint, + # self.Game, + # self.config, + # self.config.seed + seed, + # ) + # for seed in range(self.config.num_workers) + # ] + # + # # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, + # + # # Launch workers + # # 此处调用worker进行self play,把结果存在replay_buffer里 + # [ + # self_play_worker.continuous_self_play.remote( + # self.shared_storage_worker, self.replay_buffer_worker + # ) + # for self_play_worker in self.self_play_workers + # ] + + # # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 + # self.training_worker.continuous_update_weights.remote( + # self.replay_buffer_worker, self.shared_storage_worker + # ) + self.training_worker.continuous_update_weights(self.replay_buffer_worker, self.shared_storage_worker) + + # # 使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) + # if self.config.use_last_model_value: + # self.reanalyse_worker.reanalyse.remote( + # self.replay_buffer_worker, self.shared_storage_worker + # ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + # 此处是将replay buffer的结果写入文件保持 + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + # pickle用来存储和导入文件,其作用是将对象转换为字符串或者将字符串转换为对象 + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + # 此处更新replay buffer的值 + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +# @ray.remote(num_cpus=0, num_gpus=0) +# class CPUActor: +# # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU +# def __init__(self): +# pass +# +# def get_initial_weights(self, config): +# model = models.MuZeroNetwork(config) +# weigths = model.get_weights() +# summary = str(model).replace("\n", " \n\n") +# return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_No_Replay_Buffer): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_No_Replay_Buffer(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_No_Replay_Buffer(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_No_Replay_Buffer(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. 
{options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_No_Replay_Buffer(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_2net.py b/muzero_2net.py index bfdc38b0..d03457ec 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -16,11 +16,11 @@ sys.path.append("") import diagnose_model -import simplifiedMuZero.models_2net as models -import simplifiedMuZero.replay_buffer3 as replay_buffer -import simplifiedMuZero.self_play_2net as self_play +import simplifiedMuZero.net2.models_2net as models +import simplifiedMuZero.net2.replay_buffer_2net as replay_buffer +import simplifiedMuZero.net2.self_play_2net as self_play import shared_storage -import simplifiedMuZero.trainer_2net as trainer +import simplifiedMuZero.net2.trainer_2net as trainer class MuZero: diff --git a/simplifiedMuZero/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py similarity index 96% rename from simplifiedMuZero/muzero_without_replay_buffer.py rename to muzero_without_replay_buffer.py index 37436e79..e0a63690 100644 --- a/simplifiedMuZero/muzero_without_replay_buffer.py +++ b/muzero_without_replay_buffer.py @@ -14,14 +14,14 @@ from torch.utils.tensorboard import SummaryWriter import diagnose_model -import models -import replay_buffer -import self_play +import simplifiedMuZero.without_rb.models_without_replay_buffer as models +# import replay_buffer +import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play import shared_storage -import trainer +import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer -class MuZero: +class MuZero_Without_Replay_Buffer: """ Main class to manage MuZero. @@ -34,7 +34,7 @@ class MuZero: split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. 
Example: - >>> muzero = MuZero("cartpole") + >>> muzero = MuZero_Without_Replay_Buffer("cartpole") >>> muzero.train() >>> muzero.test(render=True) """ @@ -530,7 +530,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments.append(muzero) @@ -556,7 +556,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments[i] = muzero @@ -566,7 +566,7 @@ def hyperparameter_search( except KeyboardInterrupt: for experiment in running_experiments: - if isinstance(experiment, MuZero): + if isinstance(experiment, MuZero_Without_Replay_Buffer): experiment.terminate_workers() recommendation = optimizer.provide_recommendation() @@ -630,12 +630,12 @@ def load_model_menu(muzero, game_name): if __name__ == "__main__": if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole - muzero = MuZero(sys.argv[1]) + muzero = MuZero_Without_Replay_Buffer(sys.argv[1]) muzero.train() elif len(sys.argv) == 3: # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' config = json.loads(sys.argv[2]) - muzero = MuZero(sys.argv[1], config) + muzero = MuZero_Without_Replay_Buffer(sys.argv[1], config) muzero.train() else: print("\nWelcome to MuZero! Here's a list of games:") @@ -655,7 +655,7 @@ def load_model_menu(muzero, game_name): # Initialize MuZero choice = int(choice) game_name = games[choice] - muzero = MuZero(game_name) + muzero = MuZero_Without_Replay_Buffer(game_name) while True: # Configure running options @@ -715,7 +715,7 @@ def load_model_menu(muzero, game_name): best_hyperparameters = hyperparameter_search( game_name, parametrization, budget, parallel_experiments, 20 ) - muzero = MuZero(game_name, best_hyperparameters) + muzero = MuZero_Without_Replay_Buffer(game_name, best_hyperparameters) else: break print("\nDone") diff --git a/replay_buffer.py b/replay_buffer.py index 81bc813e..cc1115db 100644 --- a/replay_buffer.py +++ b/replay_buffer.py @@ -16,7 +16,7 @@ class ReplayBuffer: def __init__(self, initial_checkpoint, initial_buffer, config): self.config = config - self.buffer = copy.deepcopy(initial_buffer) + self.buffer = copy.deepcopy(initial_buffer) # buffer是一个字典,key是game id,value是game_history self.num_played_games = initial_checkpoint["num_played_games"] self.num_played_steps = initial_checkpoint["num_played_steps"] self.total_samples = sum( @@ -79,11 +79,14 @@ def get_batch(self): ) = ([], [], [], [], [], [], []) weight_batch = [] if self.config.PER else None + # 从buffer里抽取n鸽样本,有probs的话安装probs的概率抽取,没有的话按照uniform抽取 for game_id, game_history, game_prob in self.sample_n_games( self.config.batch_size ): + # 每个game_history都是一个游戏运行的序列,使用sample_position从这些序列里随机抽取一个位置 game_pos, pos_prob = self.sample_position(game_history) + # 计算从该位置开始的值,rewards等数据 values, rewards, policies, actions = self.make_target( game_history, game_pos ) @@ -165,11 +168,11 @@ def sample_n_games(self, n_games, force_uniform=False): game_id_list.append(game_id) game_probs.append(game_history.game_priority) game_probs = numpy.array(game_probs, dtype="float32") - game_probs /= 
numpy.sum(game_probs) + game_probs /= numpy.sum(game_probs) # 每一个都除以game_probs的总和,可以看成是归一化 game_prob_dict = dict( [(game_id, prob) for game_id, prob in zip(game_id_list, game_probs)] ) - selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) + selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) # 抽取n个样本, 抽取的概率是根据game_probs确定的 else: selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) game_prob_dict = {} @@ -177,10 +180,11 @@ def sample_n_games(self, n_games, force_uniform=False): (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) for game_id in selected_games ] - return ret + return ret # ret格式为[game_id, game_history, game_prob] def sample_position(self, game_history, force_uniform=False): """ + 统一或根据某些优先级从游戏中采样位置。 Sample position from game either uniformly or according to some priority. See paper appendix Training. """ @@ -230,6 +234,8 @@ def update_priorities(self, priorities, index_info): def compute_target_value(self, game_history, index): # The value target is the discounted root value of the search tree td_steps into the # future, plus the discounted sum of all rewards until then. + # 价值目标是未来搜索树 td_steps 的折扣根值,加上到那时为止的所有奖励的折扣总和。 + # 计算公式 ∑r*γ^n bootstrap_index = index + self.config.td_steps if bootstrap_index < len(game_history.root_values): root_values = ( @@ -237,6 +243,8 @@ def compute_target_value(self, game_history, index): if game_history.reanalysed_predicted_root_values is None else game_history.reanalysed_predicted_root_values ) + + # 检查当前的id和目标id是否一致,如果不一致则取负 last_step_value = ( root_values[bootstrap_index] if game_history.to_play_history[bootstrap_index] @@ -244,13 +252,15 @@ def compute_target_value(self, game_history, index): else -root_values[bootstrap_index] ) + # 计算公式 r*γ^n value = last_step_value * self.config.discount**self.config.td_steps - else: + else: # 因为终点的长度超过了数据,因此设为0 value = 0 for i, reward in enumerate( - game_history.reward_history[index + 1 : bootstrap_index + 1] + game_history.reward_history[index + 1 : bootstrap_index + 1] # 获取reward,从index+1到最大(如果长度不够则只会取到最后) ): + # 根据对手决定正负号,只会累计到value上 # The value is oriented from the perspective of the current player value += ( reward @@ -259,12 +269,13 @@ def compute_target_value(self, game_history, index): else -reward ) * self.config.discount**i - return value + return value # 返回value def make_target(self, game_history, state_index): """ Generate targets for every unroll steps. """ + # target policies 是 策略选择的概率序列,如[[0.4,0.6], [0.5,0.5],...] 
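[Editor's aside] The comments above describe the value target as the discounted root value td_steps into the future plus the discounted sum of intermediate rewards, with a sign flip whenever the bootstrap value or a reward belongs to the other player. A standalone sketch of that computation (function and argument names are illustrative; the repository's version is compute_target_value above, which reads the same quantities from a GameHistory):

def n_step_value(root_values, rewards, to_play, index, td_steps, discount):
    # Bootstrap from the search value td_steps ahead, if the game lasted that long.
    bootstrap = index + td_steps
    if bootstrap < len(root_values):
        sign = 1 if to_play[bootstrap] == to_play[index] else -1
        value = sign * root_values[bootstrap] * discount ** td_steps
    else:
        value = 0  # past the end of the game: nothing left to bootstrap from

    # Add the discounted rewards collected between index and the bootstrap step,
    # oriented from the perspective of the player to move at `index`.
    for i, reward in enumerate(rewards[index + 1 : bootstrap + 1]):
        sign = 1 if to_play[index + i] == to_play[index] else -1
        value += sign * reward * discount ** i
    return value

For a single-player game (all to_play entries equal) this reduces to value = sum_i discount**i * r[index+1+i] + discount**td_steps * v[bootstrap].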
target_values, target_rewards, target_policies, actions = [], [], [], [] for current_index in range( state_index, state_index + self.config.num_unroll_steps + 1 @@ -280,6 +291,7 @@ def make_target(self, game_history, state_index): target_values.append(0) target_rewards.append(game_history.reward_history[current_index]) # Uniform policy + # 因为是游戏结束的状态,因此选择各个策略的概率是平均分布的 target_policies.append( [ 1 / len(game_history.child_visits[0]) @@ -287,8 +299,9 @@ def make_target(self, game_history, state_index): ] ) actions.append(game_history.action_history[current_index]) - else: + else: # 如果current index 大于 game_history的长度 # States past the end of games are treated as absorbing states + # 游戏结束后的状态被视为吸收状态,因此都为0 target_values.append(0) target_rewards.append(0) # Uniform policy diff --git a/self_play.py b/self_play.py index d09c5e87..c62802f7 100644 --- a/self_play.py +++ b/self_play.py @@ -128,7 +128,7 @@ def play_game( game_history.action_history.append(0) game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 done = False diff --git a/simplifiedMuZero/__init__.py b/simplifiedMuZero/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/models_2net.py b/simplifiedMuZero/net2/models_2net.py similarity index 98% rename from simplifiedMuZero/models_2net.py rename to simplifiedMuZero/net2/models_2net.py index 0a5428df..b62de9db 100644 --- a/simplifiedMuZero/models_2net.py +++ b/simplifiedMuZero/net2/models_2net.py @@ -100,20 +100,21 @@ def __init__( stacked_observations + 1) \ + stacked_observations * observation_shape[1] * observation_shape[2] + # 输出等于输入,即编码维度等于输入维度 encoding_size = representation_input_size - self.representation_network = torch.nn.DataParallel( - # mlp( - # representation_input_size, - # fc_representation_layers, - # encoding_size, - # ) - mlp( - representation_input_size + self.action_space_size, - fc_representation_layers, - encoding_size, - ) - ) + # self.representation_network = torch.nn.DataParallel( + # # mlp( + # # representation_input_size, + # # fc_representation_layers, + # # encoding_size, + # # ) + # mlp( + # representation_input_size + self.action_space_size, + # fc_representation_layers, + # encoding_size, + # ) + # ) #dynamics的输入是encoding_size+action_space_size self.dynamics_encoded_state_network = torch.nn.DataParallel( diff --git a/simplifiedMuZero/replay_buffer3.py b/simplifiedMuZero/net2/replay_buffer_2net.py similarity index 98% rename from simplifiedMuZero/replay_buffer3.py rename to simplifiedMuZero/net2/replay_buffer_2net.py index 762d5a0e..55522b86 100644 --- a/simplifiedMuZero/replay_buffer3.py +++ b/simplifiedMuZero/net2/replay_buffer_2net.py @@ -5,7 +5,7 @@ import ray import torch -import simplifiedMuZero.models_2net as models +import simplifiedMuZero.net2.models_2net as models @ray.remote @@ -31,7 +31,7 @@ def __init__(self, initial_checkpoint, initial_buffer, config): numpy.random.seed(self.config.seed) def save_game(self, game_history, shared_storage=None): - if self.config.PER: + if self.config.PER: # config.PER指的是优先重放 Prioritized Replay(参见论文附录训练),优先选择重放缓冲区中网络意外的元素 if game_history.priorities is not None: # Avoid read only array when loading replay buffer from disk game_history.priorities = numpy.copy(game_history.priorities) diff --git a/simplifiedMuZero/self_play_2net.py b/simplifiedMuZero/net2/self_play_2net.py 
similarity index 99% rename from simplifiedMuZero/self_play_2net.py rename to simplifiedMuZero/net2/self_play_2net.py index af2a2e39..a0a208a8 100644 --- a/simplifiedMuZero/self_play_2net.py +++ b/simplifiedMuZero/net2/self_play_2net.py @@ -5,7 +5,7 @@ import ray import torch -import simplifiedMuZero.models_2net as models +import simplifiedMuZero.net2.models_2net as models @ray.remote diff --git a/simplifiedMuZero/trainer_2net.py b/simplifiedMuZero/net2/trainer_2net.py similarity index 98% rename from simplifiedMuZero/trainer_2net.py rename to simplifiedMuZero/net2/trainer_2net.py index 244fb7ee..19888cf2 100644 --- a/simplifiedMuZero/trainer_2net.py +++ b/simplifiedMuZero/net2/trainer_2net.py @@ -5,7 +5,7 @@ import ray import torch -import simplifiedMuZero.models_2net as models +import simplifiedMuZero.net2.models_2net as models @ray.remote @@ -69,6 +69,8 @@ def continuous_update_weights(self, replay_buffer, shared_storage): shared_storage.get_info.remote("terminate") ): index_batch, batch = ray.get(next_batch) + print("train batch size is : ", batch[0].shape) + print("train index_batch size is : ", index_batch.shape) next_batch = replay_buffer.get_batch.remote() self.update_lr() ( diff --git a/simplifiedMuZero/RHEA.py b/simplifiedMuZero/search_policy/RHEA.py similarity index 100% rename from simplifiedMuZero/RHEA.py rename to simplifiedMuZero/search_policy/RHEA.py diff --git a/simplifiedMuZero/search_policy/__init__.py b/simplifiedMuZero/search_policy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/models_without_replay_buffer.py b/simplifiedMuZero/without_rb/models_without_replay_buffer.py similarity index 100% rename from simplifiedMuZero/models_without_replay_buffer.py rename to simplifiedMuZero/without_rb/models_without_replay_buffer.py diff --git a/simplifiedMuZero/self_play_without_replay_buffer.py b/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py similarity index 99% rename from simplifiedMuZero/self_play_without_replay_buffer.py rename to simplifiedMuZero/without_rb/self_play_without_replay_buffer.py index 89174d92..7e0d6512 100644 --- a/simplifiedMuZero/self_play_without_replay_buffer.py +++ b/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py @@ -5,7 +5,7 @@ # import ray import torch -import models +import simplifiedMuZero.without_rb.models_without_replay_buffer as models # @ray.remote diff --git a/simplifiedMuZero/trainer_without_replay_buffer.py b/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py similarity index 97% rename from simplifiedMuZero/trainer_without_replay_buffer.py rename to simplifiedMuZero/without_rb/trainer_without_replay_buffer.py index 48236e0f..e2f64fa2 100644 --- a/simplifiedMuZero/trainer_without_replay_buffer.py +++ b/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py @@ -2,10 +2,10 @@ import time import numpy -import ray +# import ray import torch -import models +import simplifiedMuZero.without_rb.models_without_replay_buffer as models @ray.remote @@ -69,7 +69,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): next_batch = replay_buffer.get_batch.remote() # Training loop while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 ): index_batch, batch = ray.get(next_batch) next_batch = replay_buffer.get_batch.remote() diff --git a/test/game_play_test.py b/test/game_play_test.py new file mode 
100644 index 00000000..60b6a5ec --- /dev/null +++ b/test/game_play_test.py @@ -0,0 +1,696 @@ +from self_play import MCTS, GameHistory +from games.simple_grid import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import numpy +import torch + +import math +import time +import copy + +class MySelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. + """ + + def __init__(self, model, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + # self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(initial_checkpoint["weights"]) + self.model = model + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + game_id = None + + if render: + self.game.render() + + game_id = self.game.to_play() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # The index is -1: game_history already holds the reset observation when it is created, so its length is 1 and the index becomes 1 after the modulo (%) + # config.stacked_observations is the number of past observations to keep; set it to 0 if no history is needed + + # The if-else below selects an action + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play returns the id of the player to move, 0 by default + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # select the action according to the temperature + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( # select the opponent action: random, human or expert + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # advance the game + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # append to the observation queue; get_stacked_observations reads it back from the end + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_id, game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # Select an action from the visit-count distribution and the temperature. The temperature changes dynamically through the visit_softmax_temperature function in the config. + # The formula is c^(1/t): + # the smaller t, the closer 1/t gets to infinity, so the action with the largest count is almost always chosen; + # the larger t, the closer 1/t gets to 0, so c^0 = 1 for every action, the counts become indistinguishable and the choice is essentially random. + # As special cases, t=0 picks the most visited action with argmax, and t=+∞ picks uniformly at random. + @staticmethod # static method decorator, similar to the static keyword + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config.
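As a quick numeric illustration of the c^(1/t) behaviour described above (a standalone sketch, not part of this patch; the visit counts are made up), the temperature reshapes the sampling distribution like this:

import numpy

visit_counts = numpy.array([2, 8, 40], dtype="float64")
for t in (0.25, 1.0, 4.0):
    dist = visit_counts ** (1 / t)
    dist = dist / dist.sum()
    print(t, dist.round(3))
# t=0.25 -> approximately [0.000 0.002 0.998]  (nearly greedy on the most visited action)
# t=1.00 -> [0.04 0.16 0.80]                   (proportional to the visit counts)
# t=4.00 -> approximately [0.221 0.312 0.467]  (close to uniform)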
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. 
+ """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) 
+ + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss + +if __name__ == "__main__": + config = MuZeroConfig() + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(checkpoint, config) + selfplay = MySelfPlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + for i in range(config.training_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + # buffer[game_id] = game_history + + play_buffer.update_game_history(game_id, game_history) + + for i in range(10): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + trainer.update_weights(batch) + + selfplay.close_game() + + diff --git a/trainer.py b/trainer.py index faa5f941..3e035c51 100644 --- a/trainer.py +++ b/trainer.py @@ -66,7 +66,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): next_batch = replay_buffer.get_batch.remote() # Training loop while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 ): index_batch, batch 
= ray.get(next_batch) next_batch = replay_buffer.get_batch.remote() @@ -117,7 +117,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): ) > self.config.ratio and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 ): time.sleep(0.5) From 98f8b05dffcaba7954b67b0371342b142ca40f58 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 16 Aug 2023 22:09:30 +0100 Subject: [PATCH 4/9] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MuZero_No_Replay_Buffer.py | 1260 -------------- game_tournament.py | 221 +++ games/tictactoe.py | 3 +- muzero_2net.py | 20 +- muzero_uniform.py | 719 ++++++++ muzero_without_replay_buffer.py | 1443 +++++++++-------- muzero_without_replay_buffer2.py | 417 +++++ muzero_without_replay_buffer_tictactoe.py | 242 +++ simplifiedMuZero/net2/trainer_2net.py | 2 - .../self_play_uniform_search.py} | 94 +- simplifiedMuZero/without_rb/game_play.py | 182 +++ .../models_without_replay_buffer.py | 696 -------- simplifiedMuZero/without_rb/play_buffer.py | 214 +++ ...er_without_replay_buffer.py => trainer.py} | 120 +- test/game_play_test.py | 10 +- test/mcts_test.py | 245 +++ test/muzero_config_test.py | 6 + trainer.py | 2 +- 18 files changed, 3140 insertions(+), 2756 deletions(-) delete mode 100644 MuZero_No_Replay_Buffer.py create mode 100644 game_tournament.py create mode 100644 muzero_uniform.py create mode 100644 muzero_without_replay_buffer2.py create mode 100644 muzero_without_replay_buffer_tictactoe.py rename simplifiedMuZero/{without_rb/self_play_without_replay_buffer.py => search_policy/self_play_uniform_search.py} (91%) create mode 100644 simplifiedMuZero/without_rb/game_play.py delete mode 100644 simplifiedMuZero/without_rb/models_without_replay_buffer.py create mode 100644 simplifiedMuZero/without_rb/play_buffer.py rename simplifiedMuZero/without_rb/{trainer_without_replay_buffer.py => trainer.py} (67%) create mode 100644 test/mcts_test.py create mode 100644 test/muzero_config_test.py diff --git a/MuZero_No_Replay_Buffer.py b/MuZero_No_Replay_Buffer.py deleted file mode 100644 index bf280c71..00000000 --- a/MuZero_No_Replay_Buffer.py +++ /dev/null @@ -1,1260 +0,0 @@ -import copy -import importlib -import json -import math -import pathlib -import pickle -import sys -import time - -import nevergrad -import numpy -import ray -import torch -from torch.utils.tensorboard import SummaryWriter - -import diagnose_model -# import simplifiedMuZero.without_rb.models_without_replay_buffer as models -import models -# import replay_buffer -# import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play -import shared_storage -# import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer -from self_play import MCTS, GameHistory -from muzero import load_model_menu, CPUActor - -# training_step是一个全局变量,用来存储现有的运行次数,不要超过游戏config里的training_steps,如30000次 - -class GamePlay: - """ - Class which run in a dedicated thread to play games and save them to the replay-buffer. 
- """ - - def __init__(self, initial_checkpoint, Game, config, seed): - self.config = config - self.game = Game(seed) - - # Fix random generator seed - numpy.random.seed(seed) - torch.manual_seed(seed) - - # Initialize the network - self.model = models.MuZeroNetwork(self.config) - self.model.set_weights(initial_checkpoint["weights"]) - self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) - self.model.eval() - self.trained_steps = initial_checkpoint["training_step"] - self.terminate = False - - def continuous_self_play(self, test_mode=False): - # def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): - while self.trained_steps < self.config.training_steps and not self.terminate: # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 - # 此处不要用set——weights,因为现在移除了replay_buffer,不需要shared_storage了 - self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 - - if not test_mode: - # game_history = self.play_game( - # self.config.visit_softmax_temperature_fn( - # trained_steps=ray.get( - # shared_storage.get_info.remote("training_step") - # ) - # ), - # self.config.temperature_threshold, - # False, - # "self", - # 0, - # ) - game_history = self.play_game( - self.config.visit_softmax_temperature_fn( - self.trained_steps - ), - self.config.temperature_threshold, - False, - "self", - 0, - ) - - # replay_buffer.save_game.remote(game_history, shared_storage) - return game_history - - else: - # Take the best action (no exploration) in test mode # 在测试模式下采取最佳行动(无探索) - game_history = self.play_game( - 0, - self.config.temperature_threshold, - False, - "self" if len(self.config.players) == 1 else self.config.opponent, - self.config.muzero_player, - ) - - # Save to the shared storage - shared_storage.set_info.remote( - { - "episode_length": len(game_history.action_history) - 1, - "total_reward": sum(game_history.reward_history), - "mean_value": numpy.mean( - [value for value in game_history.root_values if value] - ), - } - ) - if 1 < len(self.config.players): - shared_storage.set_info.remote( - { - "muzero_reward": sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - == self.config.muzero_player - ), - "opponent_reward": sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - != self.config.muzero_player - ), - } - ) - - # Managing the self-play / training ratio - if not test_mode and self.config.self_play_delay: - time.sleep(self.config.self_play_delay) - if not test_mode and self.config.ratio: - while ( - ray.get(shared_storage.get_info.remote("training_step")) - / max( - 1, ray.get(shared_storage.get_info.remote("num_played_steps")) - ) - < self.config.ratio - and ray.get(shared_storage.get_info.remote("training_step")) - < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) - ): - time.sleep(0.5) - - self.close_game() - - # play game 与continuous self play 的区别: - # 1. play game 是实际运行游戏,游戏的结果存在game history里,不向replay buffer里写 - # 2. continuous self play 调用play game,把获取到的game history 异步写进 replay buffer - #play game 运行 - # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 - # 运行步骤: - # 1. 创建GameHistory用来存储数据 - # 2. 检查游戏是否结束或者到底最大移动次数 - # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) - # 4. 运行MCTS搜索下一步的action - # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done - # 6. 持续运行2-5步直到结束 - # 7. 
返回GameHistory - def play_game( - self, temperature, temperature_threshold, render, opponent, muzero_player - ): - """ - Play one game with actions based on the Monte Carlo tree search at each moves. - """ - game_history = GameHistory() - observation = self.game.reset() - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - if render: - self.game.render() - - with torch.no_grad(): - while ( - not done and len(game_history.action_history) <= self.config.max_moves - ): # 游戏没有结束且运行步数小于最大移动步长 - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 - # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 - - # 一下的if-else部分主要是为了选择一个动作 - # Choose the action - if opponent == "self" or muzero_player == self.game.to_play(): - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = self.select_action( - root, - temperature - if not temperature_threshold - or len(game_history.action_history) < temperature_threshold - else 0, - ) # 根据temperature选择动作 - - if render: - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print( - f"Root value for player {self.game.to_play()}: {root.value():.2f}" - ) - else: - action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 - opponent, stacked_observations - ) - - observation, reward, done = self.game.step(action) # 运行游戏 - - if render: - print(f"Played action: {self.game.action_to_string(action)}") - self.game.render() - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - return game_history - - def close_game(self): - self.game.close() - - def select_opponent_action(self, opponent, stacked_observations): - """ - Select opponent action for evaluating MuZero level. - """ - if opponent == "human": - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), - True, - ) - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") - print( - f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" - ) - return self.game.human_to_action(), root - elif opponent == "expert": - return self.game.expert_agent(), None - elif opponent == "random": - assert ( - self.game.legal_actions() - ), f"Legal actions should not be an empty array. 
Got {self.game.legal_actions()}." - assert set(self.game.legal_actions()).issubset( - set(self.config.action_space) - ), "Legal actions should be a subset of the action space." - - return numpy.random.choice(self.game.legal_actions()), None - else: - raise NotImplementedError( - 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' - ) - - # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 - # 公式为 c^(1/t)。可以看到: - # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 - # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 - # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 - @staticmethod # 静态方法修饰符,类似于static关键字 - def select_action(node, temperature): - """ - Select action according to the visit count distribution and the temperature. - The temperature is changed dynamically with the visit_softmax_temperature function - in the config. - """ - visit_counts = numpy.array( - [child.visit_count for child in node.children.values()], dtype="int32" - ) - actions = [action for action in node.children.keys()] - if temperature == 0: - action = actions[numpy.argmax(visit_counts)] - elif temperature == float("inf"): - action = numpy.random.choice(actions) - else: - # See paper appendix Data Generation - visit_count_distribution = visit_counts ** (1 / temperature) - visit_count_distribution = visit_count_distribution / sum( - visit_count_distribution - ) - action = numpy.random.choice(actions, p=visit_count_distribution) - - return action - -class Trainer_without_Replay_Buffer: - """ - Class which run in a dedicated thread to train a neural network and save it - in the shared storage. - """ - - def __init__(self, initial_checkpoint, config): - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Initialize the network - self.model = models.MuZeroNetwork(self.config) - self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) - self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) - self.model.train() - - self.training_step = initial_checkpoint["training_step"] - - if "cuda" not in str(next(self.model.parameters()).device): - print("You are not training on GPU.\n") - - # Initialize the optimizer - if self.config.optimizer == "SGD": - self.optimizer = torch.optim.SGD( - self.model.parameters(), - lr=self.config.lr_init, - momentum=self.config.momentum, - weight_decay=self.config.weight_decay, - ) - elif self.config.optimizer == "Adam": - self.optimizer = torch.optim.Adam( - self.model.parameters(), - lr=self.config.lr_init, - weight_decay=self.config.weight_decay, - ) - else: - raise NotImplementedError( - f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." - ) - - if initial_checkpoint["optimizer_state"] is not None: - print("Loading optimizer...\n") - self.optimizer.load_state_dict( - copy.deepcopy(initial_checkpoint["optimizer_state"]) - ) - - # update weights 与 continuous update weights 的区别 - # 1. update weights 是实际计算更新network的权重 - # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 - def continuous_update_weights(self, replay_buffer, shared_storage): - # Wait for the replay buffer to be filled - while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: - time.sleep(0.1) - - next_batch = replay_buffer.get_batch.remote() - # Training loop - while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") - ): - index_batch, batch = ray.get(next_batch) - next_batch = replay_buffer.get_batch.remote() - self.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = self.update_weights(batch) - - if self.config.PER: - # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) - replay_buffer.update_priorities.remote(priorities, index_batch) - - # Save to the shared storage - if self.training_step % self.config.checkpoint_interval == 0: - shared_storage.set_info.remote( - { - "weights": copy.deepcopy(self.model.get_weights()), - "optimizer_state": copy.deepcopy( - models.dict_to_cpu(self.optimizer.state_dict()) - ), - } - ) - if self.config.save_model: - shared_storage.save_checkpoint.remote() - shared_storage.set_info.remote( - { - "training_step": self.training_step, - "lr": self.optimizer.param_groups[0]["lr"], - "total_loss": total_loss, - "value_loss": value_loss, - "reward_loss": reward_loss, - "policy_loss": policy_loss, - } - ) - - # Managing the self-play / training ratio - if self.config.training_delay: - time.sleep(self.config.training_delay) - if self.config.ratio: - while ( - self.training_step - / max( - 1, ray.get(shared_storage.get_info.remote("num_played_steps")) - ) - > self.config.ratio - and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) - ): - time.sleep(0.5) - - def update_weights(self, batch): - """ - Perform one training step. 
- """ - - ( - observation_batch, - action_batch, - target_value, - target_reward, - target_policy, - weight_batch, - gradient_scale_batch, - ) = batch - - # Keep values as scalars for calculating the priorities for the prioritized replay - target_value_scalar = numpy.array(target_value, dtype="float32") - priorities = numpy.zeros_like(target_value_scalar) - - device = next(self.model.parameters()).device - if self.config.PER: - weight_batch = torch.tensor(weight_batch.copy()).float().to(device) - observation_batch = ( - torch.tensor(numpy.array(observation_batch)).float().to(device) - ) - action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) - target_value = torch.tensor(target_value).float().to(device) - target_reward = torch.tensor(target_reward).float().to(device) - target_policy = torch.tensor(target_policy).float().to(device) - gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) - # observation_batch: batch, channels, height, width - # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) - # target_value: batch, num_unroll_steps+1 - # target_reward: batch, num_unroll_steps+1 - # target_policy: batch, num_unroll_steps+1, len(action_space) - # gradient_scale_batch: batch, num_unroll_steps+1 - - target_value = models.scalar_to_support(target_value, self.config.support_size) - target_reward = models.scalar_to_support( - target_reward, self.config.support_size - ) - # target_value: batch, num_unroll_steps+1, 2*support_size+1 - # target_reward: batch, num_unroll_steps+1, 2*support_size+1 - - ## Generate predictions - value, reward, policy_logits, hidden_state = self.model.initial_inference( - observation_batch - ) - predictions = [(value, reward, policy_logits)] - for i in range(1, action_batch.shape[1]): - value, reward, policy_logits, hidden_state = self.model.recurrent_inference( - hidden_state, action_batch[:, i] - ) - # Scale the gradient at the start of the dynamics function (See paper appendix Training) - hidden_state.register_hook(lambda grad: grad * 0.5) - predictions.append((value, reward, policy_logits)) - # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) - - ## Compute losses - value_loss, reward_loss, policy_loss = (0, 0, 0) - value, reward, policy_logits = predictions[0] - # Ignore reward loss for the first batch step - current_value_loss, _, current_policy_loss = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, 0], - target_reward[:, 0], - target_policy[:, 0], - ) - value_loss += current_value_loss - policy_loss += current_policy_loss - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, 0] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) - ** self.config.PER_alpha - ) - - for i in range(1, len(predictions)): - value, reward, policy_logits = predictions[i] - ( - current_value_loss, - current_reward_loss, - current_policy_loss, - ) = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, i], - target_reward[:, i], - target_policy[:, i], - ) - - # Scale gradient by the number of unroll steps (See paper appendix Training) - current_value_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - current_reward_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - 
) - current_policy_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - - value_loss += current_value_loss - reward_loss += current_reward_loss - policy_loss += current_policy_loss - - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, i] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, i]) - ** self.config.PER_alpha - ) - - # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) - loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - if self.config.PER: - # Correct PER bias by using importance-sampling (IS) weights - loss *= weight_batch - # Mean over batch dimension (pseudocode do a sum) - loss = loss.mean() - - # Optimize - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - # 此处才算一次迭代完成,training step加1 - self.training_step += 1 - - return ( - priorities, - # For log purpose - loss.item(), - value_loss.mean().item(), - reward_loss.mean().item(), - policy_loss.mean().item(), - ) - - def update_lr(self): - """ - Update learning rate - """ - lr = self.config.lr_init * self.config.lr_decay_rate ** ( - self.training_step / self.config.lr_decay_steps - ) - for param_group in self.optimizer.param_groups: - param_group["lr"] = lr - - @staticmethod - def loss_function( - value, - reward, - policy_logits, - target_value, - target_reward, - target_policy, - ): - # Cross-entropy seems to have a better convergence than MSE - value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) - reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) - policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( - 1 - ) - return value_loss, reward_loss, policy_loss - -class MuZero_No_Replay_Buffer: - """ - Main class to manage MuZero. - - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. - - config (dict, MuZeroConfig, optional): Override the default config of the game. - - split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. - - Example: - >>> muzero = MuZero_No_Replay_Buffer("cartpole") - >>> muzero.train() - >>> muzero.test(render=True) - """ - - def __init__(self, game_name, config=None, split_resources_in=1): - # Load the game and the config from the module with the game name - try: - game_module = importlib.import_module("games." + game_name) - print("games." + game_name) - self.Game = game_module.Game - self.config = game_module.MuZeroConfig() - except ModuleNotFoundError as err: - print( - f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' - ) - raise err - - # Overwrite the config - if config: - if type(config) is dict: - for param, value in config.items(): - if hasattr(self.config, param): - setattr(self.config, param, value) - else: - raise AttributeError( - f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
- ) - else: - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Manage GPUs - if self.config.max_num_gpus == 0 and ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - raise ValueError( - "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." - ) - if ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - total_gpus = ( - self.config.max_num_gpus - if self.config.max_num_gpus is not None - else torch.cuda.device_count() - ) - else: - total_gpus = 0 - self.num_gpus = total_gpus / split_resources_in - if 1 < self.num_gpus: - self.num_gpus = math.floor(self.num_gpus) - - ray.init(num_gpus=total_gpus, ignore_reinit_error=True) - - # Checkpoint and replay buffer used to initialize workers - self.checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - self.replay_buffer = {} - - # cpu_actor = CPUActor.remote() - # cpu_weights = cpu_actor.get_initial_weights.remote(self.config) - # 移除ray - cpu_actor = CPUActor() - cpu_weights = cpu_actor.get_initial_weights(self.config) - self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) - - # Workers - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None - - def train(self, log_in_tensorboard=True): - """ - Spawn ray workers and launch the training. - - Args: - log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
- """ - if log_in_tensorboard or self.config.save_model: - self.config.results_path.mkdir(parents=True, exist_ok=True) - - # Manage GPUs - if 0 < self.num_gpus: - num_gpus_per_worker = self.num_gpus / ( - self.config.train_on_gpu - + self.config.num_workers * self.config.selfplay_on_gpu - + log_in_tensorboard * self.config.selfplay_on_gpu - + self.config.use_last_model_value * self.config.reanalyse_on_gpu - ) - if 1 < num_gpus_per_worker: - num_gpus_per_worker = math.floor(num_gpus_per_worker) - else: - num_gpus_per_worker = 0 - - # Initialize workers - # self.training_worker = trainer.Trainer.options( - # num_cpus=0, - # num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, - # ).remote(self.checkpoint, self.config) - # - # self.shared_storage_worker = shared_storage.SharedStorage.remote( - # self.checkpoint, - # self.config, - # ) - # self.shared_storage_worker.set_info.remote("terminate", False) - # - # self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( - # self.checkpoint, self.replay_buffer, self.config - # ) - - # 初始化权重 - self.training_worker = Trainer_without_Replay_Buffer(self.checkpoint, self.config) - - # #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - # if self.config.use_last_model_value: - # self.reanalyse_worker = replay_buffer.Reanalyse.options( - # num_cpus=0, - # num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, - # ).remote(self.checkpoint, self.config) - # - # self.self_play_workers = [ - # self_play.SelfPlay.options( - # num_cpus=0, - # num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, - # ).remote( - # self.checkpoint, - # self.Game, - # self.config, - # self.config.seed + seed, - # ) - # for seed in range(self.config.num_workers) - # ] - # - # # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, - # - # # Launch workers - # # 此处调用worker进行self play,把结果存在replay_buffer里 - # [ - # self_play_worker.continuous_self_play.remote( - # self.shared_storage_worker, self.replay_buffer_worker - # ) - # for self_play_worker in self.self_play_workers - # ] - - # # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 - # self.training_worker.continuous_update_weights.remote( - # self.replay_buffer_worker, self.shared_storage_worker - # ) - self.training_worker.continuous_update_weights(self.replay_buffer_worker, self.shared_storage_worker) - - # # 使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - # if self.config.use_last_model_value: - # self.reanalyse_worker.reanalyse.remote( - # self.replay_buffer_worker, self.shared_storage_worker - # ) - - if log_in_tensorboard: - self.logging_loop( - num_gpus_per_worker if self.config.selfplay_on_gpu else 0, - ) - - def logging_loop(self, num_gpus): - """ - Keep track of the training performance. 
- """ - # Launch the test worker to get performance metrics - self.test_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote( - self.checkpoint, - self.Game, - self.config, - self.config.seed + self.config.num_workers, - ) - self.test_worker.continuous_self_play.remote( - self.shared_storage_worker, None, True - ) - - # Write everything in TensorBoard - writer = SummaryWriter(self.config.results_path) - - print( - "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" - ) - - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in self.config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # Save model representation - writer.add_text( - "Model summary", - self.summary, - ) - # Loop for updating the training performance - counter = 0 - keys = [ - "total_reward", - "muzero_reward", - "opponent_reward", - "episode_length", - "mean_value", - "training_step", - "lr", - "total_loss", - "value_loss", - "reward_loss", - "policy_loss", - "num_played_games", - "num_played_steps", - "num_reanalysed_games", - ] - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - try: - while info["training_step"] < self.config.training_steps: - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - writer.add_scalar( - "1.Total_reward/1.Total_reward", - info["total_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/2.Mean_value", - info["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - info["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - info["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - info["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - info["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", info["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", info["num_played_steps"], counter - ) - writer.add_scalar( - "2.Workers/4.Reanalysed_games", - info["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - info["training_step"] / max(1, info["num_played_steps"]), - counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", info["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) - print( - f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - time.sleep(0.5) - except KeyboardInterrupt: - pass - - self.terminate_workers() - - if self.config.save_model: - # Persist replay buffer to disk - path = self.config.results_path / "replay_buffer.pkl" - print(f"\n\nPersisting replay buffer games to disk at {path}") - # 此处是将replay buffer的结果写入文件保持 - pickle.dump( - { - "buffer": self.replay_buffer, - "num_played_games": self.checkpoint["num_played_games"], - "num_played_steps": self.checkpoint["num_played_steps"], - "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], - }, - open(path, "wb"), - ) - - def terminate_workers(self): - """ - Softly terminate the running tasks and garbage collect the workers. - """ - if self.shared_storage_worker: - self.shared_storage_worker.set_info.remote("terminate", True) - self.checkpoint = ray.get( - self.shared_storage_worker.get_checkpoint.remote() - ) - if self.replay_buffer_worker: - self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) - - print("\nShutting down workers...") - - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None - - def test( - self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 - ): - """ - Test the model in a dedicated thread. - - Args: - render (bool): To display or not the environment. Defaults to True. - - opponent (str): "self" for self-play, "human" for playing against MuZero and "random" - for a random agent, None will use the opponent in the config. Defaults to None. - - muzero_player (int): Player number of MuZero in case of multiplayer - games, None let MuZero play all players turn by turn, None will use muzero_player in - the config. Defaults to None. - - num_tests (int): Number of games to average. Defaults to 1. - - num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. - """ - opponent = opponent if opponent else self.config.opponent - muzero_player = muzero_player if muzero_player else self.config.muzero_player - self_play_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) - results = [] - for i in range(num_tests): - print(f"Testing {i+1}/{num_tests}") - results.append( - ray.get( - self_play_worker.play_game.remote( - 0, - 0, - render, - opponent, - muzero_player, - ) - ) - ) - self_play_worker.close_game.remote() - - if len(self.config.players) == 1: - result = numpy.mean([sum(history.reward_history) for history in results]) - else: - result = numpy.mean( - [ - sum( - reward - for i, reward in enumerate(history.reward_history) - if history.to_play_history[i - 1] == muzero_player - ) - for history in results - ] - ) - return result - - def load_model(self, checkpoint_path=None, replay_buffer_path=None): - """ - Load a model and/or a saved replay buffer. - - Args: - checkpoint_path (str): Path to model.checkpoint or model.weights. 
- - replay_buffer_path (str): Path to replay_buffer.pkl - """ - # Load checkpoint - if checkpoint_path: - checkpoint_path = pathlib.Path(checkpoint_path) - self.checkpoint = torch.load(checkpoint_path) - print(f"\nUsing checkpoint from {checkpoint_path}") - - # Load replay buffer - if replay_buffer_path: - replay_buffer_path = pathlib.Path(replay_buffer_path) - # pickle用来存储和导入文件,其作用是将对象转换为字符串或者将字符串转换为对象 - with open(replay_buffer_path, "rb") as f: - replay_buffer_infos = pickle.load(f) - # 此处更新replay buffer的值 - self.replay_buffer = replay_buffer_infos["buffer"] - self.checkpoint["num_played_steps"] = replay_buffer_infos[ - "num_played_steps" - ] - self.checkpoint["num_played_games"] = replay_buffer_infos[ - "num_played_games" - ] - self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ - "num_reanalysed_games" - ] - - print(f"\nInitializing replay buffer with {replay_buffer_path}") - else: - print(f"Using empty buffer.") - self.replay_buffer = {} - self.checkpoint["training_step"] = 0 - self.checkpoint["num_played_steps"] = 0 - self.checkpoint["num_played_games"] = 0 - self.checkpoint["num_reanalysed_games"] = 0 - - def diagnose_model(self, horizon): - """ - Play a game only with the learned model then play the same trajectory in the real - environment and display information. - - Args: - horizon (int): Number of timesteps for which we collect information. - """ - game = self.Game(self.config.seed) - obs = game.reset() - dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) - dm.compare_virtual_with_real_trajectories(obs, game, horizon) - input("Press enter to close all plots") - dm.close_all() - - -# @ray.remote(num_cpus=0, num_gpus=0) -# class CPUActor: -# # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU -# def __init__(self): -# pass -# -# def get_initial_weights(self, config): -# model = models.MuZeroNetwork(config) -# weigths = model.get_weights() -# summary = str(model).replace("\n", " \n\n") -# return weigths, summary - - -def hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, num_tests -): - """ - Search for hyperparameters by launching parallel experiments. - - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. - - parametrization : Nevergrad parametrization, please refer to nevergrad documentation. - - budget (int): Number of experiments to launch in total. - - parallel_experiments (int): Number of experiments to launch in parallel. - - num_tests (int): Number of games to average for evaluating an experiment. 
- """ - optimizer = nevergrad.optimizers.OnePlusOne( - parametrization=parametrization, budget=budget - ) - - running_experiments = [] - best_training = None - try: - # Launch initial experiments - for i in range(parallel_experiments): - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments.append(muzero) - budget -= 1 - - while 0 < budget or any(running_experiments): - for i, experiment in enumerate(running_experiments): - if experiment and experiment.config.training_steps <= ray.get( - experiment.shared_storage_worker.get_info.remote("training_step") - ): - experiment.terminate_workers() - result = experiment.test(False, num_tests=num_tests) - if not best_training or best_training["result"] < result: - best_training = { - "result": result, - "config": experiment.config, - "checkpoint": experiment.checkpoint, - } - print(f"Parameters: {experiment.param.value}") - print(f"Result: {result}") - optimizer.tell(experiment.param, -result) - - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_No_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments[i] = muzero - budget -= 1 - else: - running_experiments[i] = None - - except KeyboardInterrupt: - for experiment in running_experiments: - if isinstance(experiment, MuZero_No_Replay_Buffer): - experiment.terminate_workers() - - recommendation = optimizer.provide_recommendation() - print("Best hyperparameters:") - print(recommendation.value) - if best_training: - # Save best training weights (but it's not the recommended weights) - best_training["config"].results_path.mkdir(parents=True, exist_ok=True) - torch.save( - best_training["checkpoint"], - best_training["config"].results_path / "model.checkpoint", - ) - # Save the recommended hyperparameters - text_file = open( - best_training["config"].results_path / "best_parameters.txt", - "w", - ) - text_file.write(str(recommendation.value)) - text_file.close() - return recommendation.value - - -if __name__ == "__main__": - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZero_No_Replay_Buffer(sys.argv[1]) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZero_No_Replay_Buffer(sys.argv[1], config) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZero_No_Replay_Buffer(game_name) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 - ) - muzero = MuZero_No_Replay_Buffer(game_name, best_hyperparameters) - else: - break - print("\nDone") - - ray.shutdown() diff --git a/game_tournament.py b/game_tournament.py new file mode 100644 index 00000000..918beac3 --- /dev/null +++ b/game_tournament.py @@ -0,0 +1,221 @@ +import pickle + +import torch +import copy +import numpy + +from games.tictactoe import MuZeroConfig, Game +import models +from self_play import MCTS, GameHistory,SelfPlay +from simplifiedMuZero.search_policy.self_play_uniform_search import UniformSearch + +class GameTournament: + def __init__(self, config:MuZeroConfig): + self.models = [] + self.game = Game(config.seed) + self.config = config + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 0 + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def play_competition(self, model1, search_policy1, model2, search_policy2): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model1.eval() + model2.eval() + + is_model1 = True + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. 
Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + model = model1 if is_model1 else model2 + search_policy = search_policy1 if is_model1 else search_policy2 + + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model1 = not is_model1 + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model1 == (reward > 0) + + def close_game(self): + self.game.close() + + def play_tournament(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = game_tournament.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + # 交换顺序,再来一遍 + for _ in range(rollnum): + have_winner, is_model1 = game_tournament.play_competition(model2, MCTS, model1, MCTS) + + if have_winner: + if is_model1: + model2_win_num += 1 + else: + model1_win_num += 1 + else: + no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + + +def load_model(model_cls, model_path): + checkpoint = torch.load(model_path) + model = model_cls(config) + model.set_weights(checkpoint["weights"]) + + return model + + +if __name__ == "__main__": + config = MuZeroConfig() + + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1) + + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path) + + 
uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path) + + without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path) + + game_tournament = GameTournament(config) + + models = [ + {"name":"muzero_2net", "model":muzero_2net_model}, + {"name":"uniform", "model":uniform_model}, + {"name":"muzero", "model":muzero_model}, + {"name": "without_rb", "model": without_rb_model}, + ] + + # rollnum = 1000 + # + # # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + # model1_win_num = 0 + # model2_win_num = 0 + # no_winner_num = 0 + # + # for i in range(rollnum): + # have_winner, is_model1 = game_tournament.play_competition(muzero_2net_model, MCTS, uniform_model, MCTS) + # + # if have_winner: + # if is_model1: + # model1_win_num += 1 + # else: + # model2_win_num += 1 + # else: + # no_winner_num += 1 + # + # # print(is_model1) + # + # print(model1_win_num) + # print(model2_win_num) + # print(no_winner_num) + + game_tournament.play_tournament(models, rollnum=100) + + game_tournament.close_game() + + + # print(checkpoint) diff --git a/games/tictactoe.py b/games/tictactoe.py index f331a9ae..c2529d5d 100644 --- a/games/tictactoe.py +++ b/games/tictactoe.py @@ -75,7 +75,8 @@ def __init__(self): ### Training self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs self.save_model = True # Save the checkpoint in results_path as model.checkpoint - self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + self.training_steps = 50000 self.batch_size = 64 # Number of parts of games to train on at each training step self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) diff --git a/muzero_2net.py b/muzero_2net.py index d03457ec..39438acd 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -23,7 +23,7 @@ import simplifiedMuZero.net2.trainer_2net as trainer -class MuZero: +class MuZero_2Net: """ Main class to manage MuZero. @@ -36,7 +36,7 @@ class MuZero: split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. 
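# Editor's sketch (not part of the patch): play_tournament in game_tournament.py above is a
# round robin -- each pair of models plays `rollnum` games and then `rollnum` more with the
# seats swapped, so neither network always moves first. Compact version of that bookkeeping;
# play_competition(a, b) is assumed to return (have_winner, first_player_won).
import itertools

def round_robin(entries, play_competition, rollnum=100):
    scores = {entry["name"]: 0 for entry in entries}
    for a, b in itertools.combinations(entries, 2):
        for first, second in ((a, b), (b, a)):       # swap seats for the second half
            for _ in range(rollnum):
                have_winner, first_won = play_competition(first["model"], second["model"])
                if have_winner:
                    scores[first["name"] if first_won else second["name"]] += 1
    return scores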
Example: - >>> muzero = MuZero("cartpole") + >>> muzero = MuZero_2Net("cartpole") >>> muzero.train() >>> muzero.test(render=True) """ @@ -67,6 +67,8 @@ def __init__(self, game_name, config=None, split_resources_in=1): else: self.config = config + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_2net" # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) @@ -525,7 +527,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments.append(muzero) @@ -551,7 +553,7 @@ def hyperparameter_search( if 0 < budget: param = optimizer.ask() print(f"Launching new experiment: {param.value}") - muzero = MuZero(game_name, param.value, parallel_experiments) + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) muzero.param = param muzero.train(False) running_experiments[i] = muzero @@ -561,7 +563,7 @@ def hyperparameter_search( except KeyboardInterrupt: for experiment in running_experiments: - if isinstance(experiment, MuZero): + if isinstance(experiment, MuZero_2Net): experiment.terminate_workers() recommendation = optimizer.provide_recommendation() @@ -625,12 +627,12 @@ def load_model_menu(muzero, game_name): if __name__ == "__main__": if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole - muzero = MuZero(sys.argv[1]) + muzero = MuZero_2Net(sys.argv[1]) muzero.train() elif len(sys.argv) == 3: # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' config = json.loads(sys.argv[2]) - muzero = MuZero(sys.argv[1], config) + muzero = MuZero_2Net(sys.argv[1], config) muzero.train() else: print("\nWelcome to MuZero! Here's a list of games:") @@ -650,7 +652,7 @@ def load_model_menu(muzero, game_name): # Initialize MuZero choice = int(choice) game_name = games[choice] - muzero = MuZero(game_name) + muzero = MuZero_2Net(game_name) while True: # Configure running options @@ -710,7 +712,7 @@ def load_model_menu(muzero, game_name): best_hyperparameters = hyperparameter_search( game_name, parametrization, budget, parallel_experiments, 20 ) - muzero = MuZero(game_name, best_hyperparameters) + muzero = MuZero_2Net(game_name, best_hyperparameters) else: break print("\nDone") diff --git a/muzero_uniform.py b/muzero_uniform.py new file mode 100644 index 00000000..24a9e09b --- /dev/null +++ b/muzero_uniform.py @@ -0,0 +1,719 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import simplifiedMuZero.search_policy.self_play_uniform_search as self_play +import shared_storage +import trainer + + +class MuZero_uniform: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. 
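# Editor's note (not part of the patch): the `self.config.results_path /= "muzero_2net"`
# line above, like the matching "muzero_uniform" line below, relies on pathlib's in-place
# `/` operator, so each variant writes its checkpoint and TensorBoard logs into its own
# subfolder of the timestamped run directory. Illustration with one of the run timestamps
# mentioned in this patch:
import pathlib

results_path = pathlib.Path("results") / "tictactoe" / "2023-08-15--11-08-42"
results_path /= "muzero_2net"
print(results_path)  # results/tictactoe/2023-08-15--11-08-42/muzero_2net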
+ + Example: + >>> muzero = MuZero_uniform("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_uniform" + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
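# Editor's sketch (not part of the patch): train() below divides the available GPUs between
# the trainer, the self-play workers, the TensorBoard test worker and the reanalyse worker,
# mirroring the denominator in the hunk that follows. Worked example with hypothetical
# settings:
num_gpus = 1
train_on_gpu, selfplay_on_gpu, reanalyse_on_gpu = True, True, False
num_workers, log_in_tensorboard, use_last_model_value = 2, True, True

consumers = (
    train_on_gpu
    + num_workers * selfplay_on_gpu
    + log_in_tensorboard * selfplay_on_gpu       # the test worker also plays games
    + use_last_model_value * reanalyse_on_gpu
)
num_gpus_per_worker = num_gpus / consumers        # 1 / (1 + 2 + 1 + 0) = 0.25 GPU each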
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
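# Editor's sketch (not part of the patch): the launch code above wires a producer/consumer
# loop -- SelfPlay actors push finished games into the ReplayBuffer actor while the Trainer
# actor pulls batches and publishes fresh weights through SharedStorage. Stripped-down,
# self-contained ray illustration of that pattern; the actor and function names here are
# hypothetical, not the project's classes.
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class Buffer:
    def __init__(self):
        self.games = []

    def save_game(self, game):
        self.games.append(game)

    def sample(self):
        return self.games[-1] if self.games else None

@ray.remote
def self_play(buffer, num_games):
    for step in range(num_games):
        ray.get(buffer.save_game.remote({"game": step}))

@ray.remote
def train(buffer, steps):
    return [ray.get(buffer.sample.remote()) for _ in range(steps)]

buffer = Buffer.remote()
ray.get(self_play.remote(buffer, 5))      # producer: fills the buffer
print(ray.get(train.remote(buffer, 3)))   # consumer: samples from it
ray.shutdown()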
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_uniform): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_uniform(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_uniform(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_uniform(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_uniform(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py index e0a63690..2eba36a0 100644 --- a/muzero_without_replay_buffer.py +++ b/muzero_without_replay_buffer.py @@ -1,723 +1,870 @@ -import copy -import importlib -import json -import math -import pathlib -import pickle -import sys -import time +from self_play import MCTS, GameHistory +from games.simple_grid import MuZeroConfig, Game +# from games.tictactoe import 
MuZeroConfig, Game +import models -import nevergrad import numpy -import ray import torch from torch.utils.tensorboard import SummaryWriter +import pickle -import diagnose_model -import simplifiedMuZero.without_rb.models_without_replay_buffer as models -# import replay_buffer -import simplifiedMuZero.without_rb.self_play_without_replay_buffer as self_play -import shared_storage -import simplifiedMuZero.without_rb.trainer_without_replay_buffer as trainer - +import math +import time +import copy -class MuZero_Without_Replay_Buffer: +class GamePlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. """ - Main class to manage MuZero. - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. + def __init__(self, model, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) - config (dict, MuZeroConfig, optional): Override the default config of the game. + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + # self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(initial_checkpoint["weights"]) + self.model = model + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + game_id = None + + if render: + self.game.render() + + game_id = self.game.to_play() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) - split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + observation, reward, done = self.game.step(action) # 运行游戏 - Example: - >>> muzero = MuZero_Without_Replay_Buffer("cartpole") - >>> muzero.train() - >>> muzero.test(render=True) - """ + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() - def __init__(self, game_name, config=None, split_resources_in=1): - # Load the game and the config from the module with the game name - try: - game_module = importlib.import_module("games." + game_name) - print("games." + game_name) - self.Game = game_module.Game - self.config = game_module.MuZeroConfig() - except ModuleNotFoundError as err: - print( - f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' - ) - raise err - - # Overwrite the config - if config: - if type(config) is dict: - for param, value in config.items(): - if hasattr(self.config, param): - setattr(self.config, param, value) - else: - raise AttributeError( - f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." - ) - else: - self.config = config + game_history.store_search_statistics(root, self.config.action_space) - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) - # Manage GPUs - if self.config.max_num_gpus == 0 and ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - raise ValueError( - "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + return game_id, game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. 
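# Editor's sketch (not part of the patch): get_stacked_observations(-1, ...) above builds the
# network input from the most recent observation plus the previous `stacked_observations`
# frames; the project's helper also encodes past actions as extra planes, which this
# simplified, hypothetical version omits.
import numpy

def stack_last(observations, num_stacked):
    # observations: list of arrays shaped (channels, height, width); newest is last
    frames = observations[-(num_stacked + 1):]
    # pad with zero frames if the game just started
    pad = [numpy.zeros_like(observations[0])] * (num_stacked + 1 - len(frames))
    return numpy.concatenate(pad + frames, axis=0)

obs_history = [numpy.ones((3, 3, 3)) * t for t in range(2)]
print(stack_last(obs_history, num_stacked=2).shape)  # (9, 3, 3)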
+ """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, ) - if ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - total_gpus = ( - self.config.max_num_gpus - if self.config.max_num_gpus is not None - else torch.cuda.device_count() + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None else: - total_gpus = 0 - self.num_gpus = total_gpus / split_resources_in - if 1 < self.num_gpus: - self.num_gpus = math.floor(self.num_gpus) - - ray.init(num_gpus=total_gpus, ignore_reinit_error=True) - - # Checkpoint and replay buffer used to initialize workers - self.checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - self.replay_buffer = {} - - cpu_actor = CPUActor.remote() - cpu_weights = cpu_actor.get_initial_weights.remote(self.config) - self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) - - # Workers - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None - - def train(self, log_in_tensorboard=True): - """ - Spawn ray workers and launch the training. + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) - Args: - log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. 
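# Editor's sketch (not part of the patch): the comments above describe the c**(1/t) rule
# that select_action implements just below. Worked example with hypothetical visit counts:
# the most-visited child is chosen ~60% of the time at t=1, almost always as t -> 0, and
# all children approach equal probability as t grows.
import numpy

visit_counts = numpy.array([6, 3, 1], dtype="float64")
for temperature in (1.0, 0.25, 4.0):
    distribution = visit_counts ** (1 / temperature)
    distribution = distribution / distribution.sum()
    print(temperature, numpy.round(distribution, 3))
# 1.0  -> [0.6    0.3    0.1  ]
# 0.25 -> [~0.94  ~0.059 ~0.001]   (sharp, close to argmax)
# 4.0  -> [~0.403 ~0.339 ~0.258]   (flat, close to uniform)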
""" - if log_in_tensorboard or self.config.save_model: - self.config.results_path.mkdir(parents=True, exist_ok=True) - - # Manage GPUs - if 0 < self.num_gpus: - num_gpus_per_worker = self.num_gpus / ( - self.config.train_on_gpu - + self.config.num_workers * self.config.selfplay_on_gpu - + log_in_tensorboard * self.config.selfplay_on_gpu - + self.config.use_last_model_value * self.config.reanalyse_on_gpu - ) - if 1 < num_gpus_per_worker: - num_gpus_per_worker = math.floor(num_gpus_per_worker) + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) else: - num_gpus_per_worker = 0 + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) - # Initialize workers - self.training_worker = trainer.Trainer.options( - num_cpus=0, - num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, - ).remote(self.checkpoint, self.config) + return action - self.shared_storage_worker = shared_storage.SharedStorage.remote( - self.checkpoint, - self.config, - ) - self.shared_storage_worker.set_info.remote("terminate", False) +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ - self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( - self.checkpoint, self.replay_buffer, self.config + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) - #使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - if self.config.use_last_model_value: - self.reanalyse_worker = replay_buffer.Reanalyse.options( - num_cpus=0, - num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, - ).remote(self.checkpoint, self.config) - - self.self_play_workers = [ - self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, - ).remote( - self.checkpoint, - self.Game, - self.config, - self.config.seed + seed, - ) - for seed in range(self.config.num_workers) - ] + # Fix random generator seed + numpy.random.seed(self.config.seed) - # 这里调用continuous类的函数,主要是continuous函数会调用replay_buffer, + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], 
[], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) - # Launch workers - # 此处调用worker进行self play,把结果存在replay_buffer里 - [ - self_play_worker.continuous_self_play.remote( - self.shared_storage_worker, self.replay_buffer_worker + values, rewards, policies, actions = self.make_target( + game_history, game_pos ) - for self_play_worker in self.self_play_workers - ] - # 此处使用trainer,从replay buffer里按batch抽取数据,进行网络训练和更新 - self.training_worker.continuous_update_weights.remote( - self.replay_buffer_worker, self.shared_storage_worker - ) - # 使用最后一个模型提供更新鲜、稳定的n步值(参见论文附录Reanalyze) - if self.config.use_last_model_value: - self.reanalyse_worker.reanalyse.remote( - self.replay_buffer_worker, self.shared_storage_worker + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) ) - - if log_in_tensorboard: - self.logging_loop( - num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) ) - def logging_loop(self, num_gpus): - """ - Keep track of the training performance. - """ - # Launch the test worker to get performance metrics - self.test_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote( - self.checkpoint, - self.Game, - self.config, - self.config.seed + self.config.num_workers, - ) - self.test_worker.continuous_self_play.remote( - self.shared_storage_worker, None, True - ) - - # Write everything in TensorBoard - writer = SummaryWriter(self.config.results_path) - - print( - "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), ) - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in self.config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # Save model representation - writer.add_text( - "Model summary", - self.summary, - ) - # Loop for updating the training performance - counter = 0 - keys = [ - "total_reward", - "muzero_reward", - "opponent_reward", - "episode_length", - "mean_value", - "training_step", - "lr", - "total_loss", - "value_loss", - "reward_loss", - "policy_loss", - "num_played_games", - "num_played_steps", - "num_reanalysed_games", - ] - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - try: - while info["training_step"] < self.config.training_steps: - info = ray.get(self.shared_storage_worker.get_info.remote(keys)) - writer.add_scalar( - "1.Total_reward/1.Total_reward", - info["total_reward"], - counter, - ) - 
writer.add_scalar( - "1.Total_reward/2.Mean_value", - info["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - info["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - info["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - info["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - info["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", info["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", info["num_played_steps"], counter - ) - writer.add_scalar( - "2.Workers/4.Reanalysed_games", - info["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - info["training_step"] / max(1, info["num_played_steps"]), - counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", info["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) - print( - f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. Loss: {info["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - time.sleep(0.5) - except KeyboardInterrupt: - pass - - self.terminate_workers() - - if self.config.save_model: - # Persist replay buffer to disk - path = self.config.results_path / "replay_buffer.pkl" - print(f"\n\nPersisting replay buffer games to disk at {path}") - pickle.dump( - { - "buffer": self.replay_buffer, - "num_played_games": self.checkpoint["num_played_games"], - "num_played_steps": self.checkpoint["num_played_steps"], - "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], - }, - open(path, "wb"), - ) - - def terminate_workers(self): + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 """ - Softly terminate the running tasks and garbage collect the workers. + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. """ - if self.shared_storage_worker: - self.shared_storage_worker.set_info.remote("terminate", True) - self.checkpoint = ray.get( - self.shared_storage_worker.get_checkpoint.remote() - ) - if self.replay_buffer_worker: - self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + game_prob = None - print("\nShutting down workers...") + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index - self.self_play_workers = None - self.test_worker = None - self.training_worker = None - self.reanalyse_worker = None - self.replay_buffer_worker = None - self.shared_storage_worker = None + return game_id, self.buffer[game_id], game_prob - def test( - self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 - ): + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): """ - Test the model in a dedicated thread. 
+ Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None - Args: - render (bool): To display or not the environment. Defaults to True. + position_index = numpy.random.choice(len(game_history.root_values)) - opponent (str): "self" for self-play, "human" for playing against MuZero and "random" - for a random agent, None will use the opponent in the config. Defaults to None. + return position_index, position_prob - muzero_player (int): Player number of MuZero in case of multiplayer - games, None let MuZero play all players turn by turn, None will use muzero_player in - the config. Defaults to None. + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history - num_tests (int): Number of games to average. Defaults to 1. + self.buffer[game_id] = game_history - num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. - """ - opponent = opponent if opponent else self.config.opponent - muzero_player = muzero_player if muzero_player else self.config.muzero_player - self_play_worker = self_play.SelfPlay.options( - num_cpus=0, - num_gpus=num_gpus, - ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) - results = [] - for i in range(num_tests): - print(f"Testing {i+1}/{num_tests}") - results.append( - ray.get( - self_play_worker.play_game.remote( - 0, - 0, - render, - opponent, - muzero_player, - ) - ) + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] ) - self_play_worker.close_game.remote() - if len(self.config.players) == 1: - result = numpy.mean([sum(history.reward_history) for history in results]) + value = last_step_value * self.config.discount**self.config.td_steps else: - result = numpy.mean( - [ - sum( - reward - for i, reward in enumerate(history.reward_history) - if history.to_play_history[i - 1] == muzero_player - ) - for history in results - ] - ) - return result + value = 0 - def load_model(self, checkpoint_path=None, replay_buffer_path=None): - """ - Load a model and/or a saved replay buffer. + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i - Args: - checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ return value - replay_buffer_path (str): Path to replay_buffer.pkl + def make_target(self, game_history, state_index): """ - # Load checkpoint - if checkpoint_path: - checkpoint_path = pathlib.Path(checkpoint_path) - self.checkpoint = torch.load(checkpoint_path) - print(f"\nUsing checkpoint from {checkpoint_path}") - - # Load replay buffer - if replay_buffer_path: - replay_buffer_path = pathlib.Path(replay_buffer_path) - with open(replay_buffer_path, "rb") as f: - replay_buffer_infos = pickle.load(f) - self.replay_buffer = replay_buffer_infos["buffer"] - self.checkpoint["num_played_steps"] = replay_buffer_infos[ - "num_played_steps" - ] - self.checkpoint["num_played_games"] = replay_buffer_infos[ - "num_played_games" - ] - self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ - "num_reanalysed_games" - ] - - print(f"\nInitializing replay buffer with {replay_buffer_path}") - else: - print(f"Using empty buffer.") - self.replay_buffer = {} - self.checkpoint["training_step"] = 0 - self.checkpoint["num_played_steps"] = 0 - self.checkpoint["num_played_games"] = 0 - self.checkpoint["num_reanalysed_games"] = 0 - - def diagnose_model(self, horizon): + Generate targets for every unroll steps. """ - Play a game only with the learned model then play the same trajectory in the real - environment and display information. + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) - Args: - horizon (int): Number of timesteps for which we collect information. - """ - game = self.Game(self.config.seed) - obs = game.reset() - dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) - dm.compare_virtual_with_real_trajectories(obs, game, horizon) - input("Press enter to close all plots") - dm.close_all() - - -@ray.remote(num_cpus=0, num_gpus=0) -class CPUActor: - # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU - def __init__(self): - pass + return target_values, target_rewards, target_policies, actions - def get_initial_weights(self, config): - model = models.MuZeroNetwork(config) - weigths = model.get_weights() - summary = str(model).replace("\n", " \n\n") - return weigths, summary +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. 
+ """ + def __init__(self, initial_checkpoint, config): + self.config = config -def hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, num_tests -): - """ - Search for hyperparameters by launching parallel experiments. + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) - Args: - game_name (str): Name of the game module, it should match the name of a .py file - in the "./games" directory. + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() - parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + self.training_step = initial_checkpoint["training_step"] - budget (int): Number of experiments to launch in total. + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") - parallel_experiments (int): Number of experiments to launch in parallel. + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) - num_tests (int): Number of games to average for evaluating an experiment. - """ - optimizer = nevergrad.optimizers.OnePlusOne( - parametrization=parametrization, budget=budget - ) + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ - running_experiments = [] - best_training = None - try: - # Launch initial experiments - for i in range(parallel_experiments): - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments.append(muzero) - budget -= 1 - - while 0 < budget or any(running_experiments): - for i, experiment in enumerate(running_experiments): - if experiment and experiment.config.training_steps <= ray.get( - experiment.shared_storage_worker.get_info.remote("training_step") - ): - experiment.terminate_workers() - result = experiment.test(False, num_tests=num_tests) - if not best_training or best_training["result"] < result: - best_training = { - "result": result, - "config": experiment.config, - "checkpoint": experiment.checkpoint, - } - print(f"Parameters: {experiment.param.value}") - print(f"Result: {result}") - optimizer.tell(experiment.param, -result) - - if 0 < budget: - param = optimizer.ask() - print(f"Launching new experiment: {param.value}") - muzero = MuZero_Without_Replay_Buffer(game_name, param.value, parallel_experiments) - muzero.param = param - muzero.train(False) - running_experiments[i] = muzero - budget -= 1 - else: - running_experiments[i] = None + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 - except KeyboardInterrupt: - for experiment in running_experiments: - if isinstance(experiment, MuZero_Without_Replay_Buffer): - experiment.terminate_workers() - - recommendation = optimizer.provide_recommendation() - print("Best hyperparameters:") - print(recommendation.value) - if best_training: - # Save best training weights (but it's not the recommended weights) - best_training["config"].results_path.mkdir(parents=True, exist_ok=True) - torch.save( - best_training["checkpoint"], - best_training["config"].results_path / "model.checkpoint", + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in 
range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], ) - # Save the recommended hyperparameters - text_file = open( - best_training["config"].results_path / "best_parameters.txt", - "w", + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha ) - text_file.write(str(recommendation.value)) - text_file.close() - return recommendation.value + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) -def load_model_menu(muzero, game_name): - # Configure running options - options = ["Specify paths manually"] + sorted( - (pathlib.Path("results") / game_name).glob("*/") - ) - options.reverse() - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose a model to load: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - - if choice == (len(options) - 1): - # manual path option - checkpoint_path = input( - "Enter a path to the model.checkpoint, or ENTER if none: " + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), ) - while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): - checkpoint_path = input("Invalid checkpoint path. Try again: ") - replay_buffer_path = input( - "Enter a path to the replay_buffer.pkl, or ENTER if none: " + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps ) - while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): - replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") - else: - checkpoint_path = options[choice] / "model.checkpoint" - replay_buffer_path = options[choice] / "replay_buffer.pkl" - - muzero.load_model( - checkpoint_path=checkpoint_path, - replay_buffer_path=replay_buffer_path, + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss + + +def logging_loop(config, checkpoint, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) + print( + f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. 
Loss: {checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass -if __name__ == "__main__": - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZero_Without_Replay_Buffer(sys.argv[1]) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZero_Without_Replay_Buffer(sys.argv[1], config) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZero_Without_Replay_Buffer(game_name) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. {options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + +def update_gameplay_checkpoint(config, checkpoint, game_history): + checkpoint["episode_length"] = len(game_history.action_history) - 1 + checkpoint["total_reward"] = 
sum(game_history.reward_history) + checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(config.players): + checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != config.muzero_player ) - muzero = MuZero_Without_Replay_Buffer(game_name, best_hyperparameters) - else: - break - print("\nDone") - ray.shutdown() +def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 + if not path: + path = config.results_path / "model.checkpoint" + + torch.save(checkpoint, path) + +def train(log_in_tensorboard=True): + config = MuZeroConfig() + config.results_path /= "muzero_without_rb" + + if log_in_tensorboard or config.save_model: + config.results_path.mkdir(parents=True, exist_ok=True) + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(checkpoint, config) + selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(config.training_steps/step) + + writer = SummaryWriter(config.results_path) + + for episode in range(max_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + update_gameplay_checkpoint(config, checkpoint, game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % config.checkpoint_interval == 0: + checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if config.save_model: + save_checkpoint(config, checkpoint) + checkpoint["training_step"] = training_step + checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + checkpoint["total_loss"] = total_loss + checkpoint["value_loss"] = value_loss + checkpoint["reward_loss"] = reward_loss + checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + logging_loop(config, checkpoint, writer, training_step) + + + writer.close() + + selfplay.close_game() + +if __name__ == "__main__": + start_time = time.time() + train() + end_time = time.time() + print("耗时: 
{:.2f}秒".format(end_time - start_time)) \ No newline at end of file diff --git a/muzero_without_replay_buffer2.py b/muzero_without_replay_buffer2.py new file mode 100644 index 00000000..ebbb147f --- /dev/null +++ b/muzero_without_replay_buffer2.py @@ -0,0 +1,417 @@ +import pathlib +import importlib +import ray + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter +import pickle + +import math +import time +import copy +import nevergrad +import sys +import json + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer +from muzero import load_model_menu, hyperparameter_search + +import models + + +class CPUActorWithClass: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config, model_cls): + model = model_cls(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + +class MuZeroWithoutRB: + def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + if save_path_ex: + config.results_path /= save_path_ex + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + self.model_cls = model_cls + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActorWithClass() + cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) + self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) + + + def logging_loop(self, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + self.checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + self.checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + self.checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + self.checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + self.checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + self.checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + self.checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) + print( + f'Last test reward: {self.checkpoint["total_reward"]:.2f}. 
Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + + def update_gameplay_checkpoint(self, game_history): + self.checkpoint["episode_length"] = len(game_history.action_history) - 1 + self.checkpoint["total_reward"] = sum(game_history.reward_history) + self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(self.config.players): + self.checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ) + self.checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ) + + def save_checkpoint(self, path=None): #将模型存储在文件中 + if not path: + path = self.config.results_path / "model.checkpoint" + + torch.save(self.checkpoint, path) + + def train(self, log_in_tensorboard=True): + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + + trainer = Trainer(models.MuZeroNetwork, self.checkpoint, self.config) + game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) + buffer = {} + play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(self.config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(self.config.results_path) + + for episode in range(max_steps): + game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + self.update_gameplay_checkpoint( game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % self.config.checkpoint_interval == 0: + self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if self.config.save_model: + self.save_checkpoint() + self.checkpoint["training_step"] = training_step + self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + self.checkpoint["total_loss"] = total_loss + self.checkpoint["value_loss"] = value_loss + self.checkpoint["reward_loss"] = 
reward_loss + self.checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + self.logging_loop(writer, training_step) + + + writer.close() + + game_play.close_game() + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = models.MuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroWithoutRB(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroWithoutRB(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroWithoutRB(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. 
{options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroWithoutRB(game_name, best_hyperparameters , model_cls=model_cls) + else: + break + print("\nDone") diff --git a/muzero_without_replay_buffer_tictactoe.py b/muzero_without_replay_buffer_tictactoe.py new file mode 100644 index 00000000..f64413ab --- /dev/null +++ b/muzero_without_replay_buffer_tictactoe.py @@ -0,0 +1,242 @@ +from self_play import MCTS, GameHistory +from games.tictactoe import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter +import pickle + +import math +import time +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer + +def logging_loop(config, checkpoint, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + 
"1.Total_reward/5.Opponent_reward", + checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) + print( + f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. Loss: {checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + +def update_gameplay_checkpoint(config, checkpoint, game_history): + checkpoint["episode_length"] = len(game_history.action_history) - 1 + checkpoint["total_reward"] = sum(game_history.reward_history) + checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(config.players): + checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != config.muzero_player + ) + +def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 + if not path: + path = config.results_path / "model.checkpoint" + + torch.save(checkpoint, path) + +def train(log_in_tensorboard=True): + config = MuZeroConfig() + config.results_path /= "muzero_without_rb" + + if log_in_tensorboard or config.save_model: + config.results_path.mkdir(parents=True, exist_ok=True) + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(models.MuZeroNetwork, checkpoint, config) + selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(config.training_steps/step) + # 
max_steps = 2000 + + writer = SummaryWriter(config.results_path) + + for episode in range(max_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + update_gameplay_checkpoint(config, checkpoint, game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % config.checkpoint_interval == 0: + checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if config.save_model: + save_checkpoint(config, checkpoint) + checkpoint["training_step"] = training_step + checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + checkpoint["total_loss"] = total_loss + checkpoint["value_loss"] = value_loss + checkpoint["reward_loss"] = reward_loss + checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + logging_loop(config, checkpoint, writer, training_step) + + + writer.close() + + selfplay.close_game() + +if __name__ == "__main__": + start_time = time.time() + train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) \ No newline at end of file diff --git a/simplifiedMuZero/net2/trainer_2net.py b/simplifiedMuZero/net2/trainer_2net.py index 19888cf2..567b8f9a 100644 --- a/simplifiedMuZero/net2/trainer_2net.py +++ b/simplifiedMuZero/net2/trainer_2net.py @@ -69,8 +69,6 @@ def continuous_update_weights(self, replay_buffer, shared_storage): shared_storage.get_info.remote("terminate") ): index_batch, batch = ray.get(next_batch) - print("train batch size is : ", batch[0].shape) - print("train index_batch size is : ", index_batch.shape) next_batch = replay_buffer.get_batch.remote() self.update_lr() ( diff --git a/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py b/simplifiedMuZero/search_policy/self_play_uniform_search.py similarity index 91% rename from simplifiedMuZero/without_rb/self_play_without_replay_buffer.py rename to simplifiedMuZero/search_policy/self_play_uniform_search.py index 7e0d6512..314249f0 100644 --- a/simplifiedMuZero/without_rb/self_play_without_replay_buffer.py +++ b/simplifiedMuZero/search_policy/self_play_uniform_search.py @@ -2,13 +2,13 @@ import time import numpy -# import ray +import ray import torch -import simplifiedMuZero.without_rb.models_without_replay_buffer as models +import models -# @ray.remote +@ray.remote class SelfPlay: """ Class which run in a dedicated thread to play games and save them to the replay-buffer. @@ -107,9 +107,6 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): self.close_game() - # play game 与continuous self play 的区别: - # 1. play game 是实际运行游戏,游戏的结果存在game history里,不向replay buffer里写 - # 2. 
continuous self play 调用play game,把获取到的game history 异步写进 replay buffer #play game 运行 # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 # 运行步骤: @@ -131,7 +128,7 @@ def play_game( game_history.action_history.append(0) game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 done = False @@ -157,7 +154,7 @@ def play_game( # 一下的if-else部分主要是为了选择一个动作 # Choose the action if opponent == "self" or muzero_player == self.game.to_play(): - root, mcts_info = MCTS(self.config).run( + root, mcts_info = UniformSearch(self.config).run( self.model, stacked_observations, self.game.legal_actions(), @@ -206,7 +203,7 @@ def select_opponent_action(self, opponent, stacked_observations): Select opponent action for evaluating MuZero level. """ if opponent == "human": - root, mcts_info = MCTS(self.config).run( + root, mcts_info = UniformSearch(self.config).run( self.model, stacked_observations, self.game.legal_actions(), @@ -267,7 +264,7 @@ def select_action(node, temperature): # Game independent -class MCTS: +class UniformSearch: """ Core Monte Carlo Tree Search algorithm. To decide on an action, we run N simulations, always starting at the root of @@ -411,46 +408,47 @@ def select_child(self, node, min_max_stats): """ Select the child with the highest UCB score. """ - max_ucb = max( - self.ucb_score(node, child, min_max_stats) - for action, child in node.children.items() - ) - action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) - [ - action - for action, child in node.children.items() - if self.ucb_score(node, child, min_max_stats) == max_ucb - ] - ) + # max_ucb = max( + # self.ucb_score(node, child, min_max_stats) + # for action, child in node.children.items() + # ) + # action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + # [ + # action + # for action, child in node.children.items() + # if self.ucb_score(node, child, min_max_stats) == max_ucb + # ] + # ) + action = numpy.random.choice([action for action,child in node.children.items()]) return action, node.children[action] - def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 - """ - The score for a node is based on its value, plus an exploration bonus based on the prior. - """ - pb_c = ( - math.log( - (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 - ) - + self.config.pb_c_init - ) - pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) - - prior_score = pb_c * child.prior # prior 之前的p_value - # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) - # prior_score = pbc * prior - - if child.visit_count > 0: - # Mean value Q - value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 - child.reward - + self.config.discount # 衰减系数, 之后乘以子节点的值 - * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 - ) - else: - value_score = 0 - - return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + # def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + # """ + # The score for a node is based on its value, plus an exploration bonus based on the prior. 
+    #     """
+    #     pb_c = (
+    #         math.log(
+    #             (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base  # pb_c_base is set in the config
+    #         )
+    #         + self.config.pb_c_init
+    #     )
+    #     pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1)
+    #
+    #     prior_score = pb_c * child.prior  # prior is the policy probability of this child
+    #     # formula: pb_c = (log((N+C+1)/C) + init) * sqrt(N/(VC+1))
+    #     # prior_score = pb_c * prior
+    #
+    #     if child.visit_count > 0:
+    #         # Mean value Q
+    #         value_score = min_max_stats.normalize(  # this expression is the Q value, Q = E[r + discount * Q']; it is normalized here
+    #             child.reward
+    #             + self.config.discount  # discount factor, applied to the child value
+    #             * (child.value() if len(self.config.players) == 1 else -child.value())  # with more than one player the child belongs to the opponent, so its value is negated
+    #         )
+    #     else:
+    #         value_score = 0
+    #
+    #     return prior_score + value_score  # the prior score plus the value score gives the UCB score
 
     # Backpropagation
     # Add 1 to the visit count of every node on the path and accumulate the value/reward
diff --git a/simplifiedMuZero/without_rb/game_play.py b/simplifiedMuZero/without_rb/game_play.py
new file mode 100644
index 00000000..b0304d64
--- /dev/null
+++ b/simplifiedMuZero/without_rb/game_play.py
@@ -0,0 +1,182 @@
+import numpy
+import torch
+from self_play import GameHistory, MCTS
+class GamePlay:
+    """
+    Class used to play full games with the current model and return the resulting
+    game history (this variant runs in the main process and does not use a replay buffer).
+    """
+
+    def __init__(self, model, initial_checkpoint, Game, config, seed):
+        self.config = config
+        self.game = Game(seed)
+
+        # Fix random generator seed
+        numpy.random.seed(seed)
+        torch.manual_seed(seed)
+
+        # Initialize the network
+        # self.model = models.MuZeroNetwork(self.config)
+        # self.model.set_weights(initial_checkpoint["weights"])
+        self.model = model
+        self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu"))
+        self.model.eval()
+        self.trained_steps = initial_checkpoint["training_step"]
+        self.terminate = False
+
+    # How play_game runs:
+    # The legal actions are fixed and provided by the game file (legal_actions is called
+    # here without inspecting env, i.e. the action set does not depend on changes in the
+    # game environment).
+    # Steps:
+    # 1. Create a GameHistory to store the trajectory
+    # 2. Check whether the game is over or the maximum number of moves has been reached
+    # 3. Build the stacked observations (some games need the previous history and move trajectory)
+    # 4. Run MCTS to search for the next action
+    # 5. Call the game's step(action) to get the next observation, reward and done flag
+    # 6. Repeat steps 2-5 until the game ends
+    # 7. Return the GameHistory
+    def play_game(
+        self, temperature, temperature_threshold, render, opponent, muzero_player
+    ):
+        """
+        Play one game with actions based on the Monte Carlo tree search at each move.
+        """
+        game_history = GameHistory()
+        observation = self.game.reset()
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # append the observation returned by reset()
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+        game_id = None
+
+        if render:
+            self.game.render()
+
+        game_id = self.game.to_play()
+
+        with torch.no_grad():
+            while (
+                not done and len(game_history.action_history) <= self.config.max_moves
+            ):  # while the game is not over and the number of moves is below max_moves
+                assert (
+                    len(numpy.array(observation).shape) == 3
+                ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+                assert (
+                    numpy.array(observation).shape == self.config.observation_shape
+                ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
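+                # Build the network input: the latest observation stacked with the
+                # previous `stacked_observations` frames and the corresponding action
+                # planes (see GameHistory.get_stacked_observations).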
+                stacked_observations = game_history.get_stacked_observations(
+                    -1, self.config.stacked_observations, len(self.config.action_space)
+                )
+                # The index is -1: game_history already holds the observation appended at
+                # reset, so taking the index modulo the history length points at the most
+                # recent observation.
+                # config.stacked_observations is the number of past observations to keep;
+                # set it to 0 if no history is needed.
+
+                # Choose the action (the if/else below picks it either with MCTS or from the opponent)
+                if opponent == "self" or muzero_player == self.game.to_play():
+                    root, mcts_info = MCTS(self.config).run(
+                        self.model,
+                        stacked_observations,
+                        self.game.legal_actions(),
+                        self.game.to_play(),  # to_play returns the id of the player whose turn it is (0 by default)
+                        True,
+                    )
+                    action = self.select_action(
+                        root,
+                        temperature
+                        if not temperature_threshold
+                        or len(game_history.action_history) < temperature_threshold
+                        else 0,
+                    )  # select the action according to the temperature
+
+                    if render:
+                        print(f'Tree depth: {mcts_info["max_tree_depth"]}')
+                        print(
+                            f"Root value for player {self.game.to_play()}: {root.value():.2f}"
+                        )
+                else:
+                    action, root = self.select_opponent_action(  # opponent action: "random", "human" or "expert"
+                        opponent, stacked_observations
+                    )
+
+                observation, reward, done = self.game.step(action)  # advance the environment one step
+
+                if render:
+                    print(f"Played action: {self.game.action_to_string(action)}")
+                    self.game.render()
+
+                game_history.store_search_statistics(root, self.config.action_space)
+
+                # Next batch
+                game_history.action_history.append(action)
+                game_history.observation_history.append(observation)  # appended to the observation history; get_stacked_observations later reads it back from the end
+                game_history.reward_history.append(reward)
+                game_history.to_play_history.append(self.game.to_play())
+
+        return game_id, game_history
+
+    def close_game(self):
+        self.game.close()
+
+    def select_opponent_action(self, opponent, stacked_observations):
+        """
+        Select opponent action for evaluating MuZero level.
+        """
+        if opponent == "human":
+            root, mcts_info = MCTS(self.config).run(
+                self.model,
+                stacked_observations,
+                self.game.legal_actions(),
+                self.game.to_play(),
+                True,
+            )
+            print(f'Tree depth: {mcts_info["max_tree_depth"]}')
+            print(f"Root value for player {self.game.to_play()}: {root.value():.2f}")
+            print(
+                f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}"
+            )
+            return self.game.human_to_action(), root
+        elif opponent == "expert":
+            return self.game.expert_agent(), None
+        elif opponent == "random":
+            assert (
+                self.game.legal_actions()
+            ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}."
+            assert set(self.game.legal_actions()).issubset(
+                set(self.config.action_space)
+            ), "Legal actions should be a subset of the action space."
+
+            return numpy.random.choice(self.game.legal_actions()), None
+        else:
+            raise NotImplementedError(
+                'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"'
+            )
+
+    # Select an action according to the visit count distribution and the temperature.
+    # The temperature is changed dynamically via the visit_softmax_temperature function in the config.
+    # Each action is weighted by c^(1/t), where c is its visit count, so:
+    # the smaller t is, the larger 1/t becomes and the more likely the most visited action is chosen;
+    # the larger t is, 1/t -> 0 and c^0 = 1, so all counts become indistinguishable and the choice is close to uniform.
+    # As special cases, t = 0 picks the most visited action (argmax) and t = +inf picks uniformly at random.
+    @staticmethod  # static method decorator, similar to the `static` keyword in other languages
+    def select_action(node, temperature):
+        """
+        Select action according to the visit count distribution and the temperature.
+        The temperature is changed dynamically with the visit_softmax_temperature function
+        in the config.
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action \ No newline at end of file diff --git a/simplifiedMuZero/without_rb/models_without_replay_buffer.py b/simplifiedMuZero/without_rb/models_without_replay_buffer.py deleted file mode 100644 index d4b8bc2f..00000000 --- a/simplifiedMuZero/without_rb/models_without_replay_buffer.py +++ /dev/null @@ -1,696 +0,0 @@ -import math -from abc import ABC, abstractmethod - -import torch - - -class MuZeroNetwork: - def __new__(cls, config): - if config.network == "fullyconnected": - return MuZeroFullyConnectedNetwork( - config.observation_shape, - config.stacked_observations, - len(config.action_space), - config.encoding_size, - config.fc_reward_layers, - config.fc_value_layers, - config.fc_policy_layers, - config.fc_representation_layers, - config.fc_dynamics_layers, - config.support_size, - ) - elif config.network == "resnet": - return MuZeroResidualNetwork( - config.observation_shape, - config.stacked_observations, - len(config.action_space), - config.blocks, - config.channels, - config.reduced_channels_reward, - config.reduced_channels_value, - config.reduced_channels_policy, - config.resnet_fc_reward_layers, - config.resnet_fc_value_layers, - config.resnet_fc_policy_layers, - config.support_size, - config.downsample, - ) - else: - raise NotImplementedError( - 'The network parameter should be "fullyconnected" or "resnet".' 
- ) - - -def dict_to_cpu(dictionary): - cpu_dict = {} - for key, value in dictionary.items(): - if isinstance(value, torch.Tensor): - cpu_dict[key] = value.cpu() - elif isinstance(value, dict): - cpu_dict[key] = dict_to_cpu(value) - else: - cpu_dict[key] = value - return cpu_dict - - -class AbstractNetwork(ABC, torch.nn.Module): - def __init__(self): - super().__init__() - pass - - @abstractmethod - def initial_inference(self, observation): - pass - - @abstractmethod - def recurrent_inference(self, encoded_state, action): - pass - - def get_weights(self): - return dict_to_cpu(self.state_dict()) - - def set_weights(self, weights): - self.load_state_dict(weights) - - -################################## -######## Fully Connected ######### - - -class MuZeroFullyConnectedNetwork(AbstractNetwork): - def __init__( - self, - observation_shape, - stacked_observations, - action_space_size, - encoding_size, - fc_reward_layers, - fc_value_layers, - fc_policy_layers, - fc_representation_layers, - fc_dynamics_layers, - support_size, - ): - super().__init__() - self.action_space_size = action_space_size - self.full_support_size = 2 * support_size + 1 - # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 - - self.representation_network = torch.nn.DataParallel( - mlp( - observation_shape[0] - * observation_shape[1] - * observation_shape[2] - * (stacked_observations + 1) - + stacked_observations * observation_shape[1] * observation_shape[2], - fc_representation_layers, - encoding_size, - ) - ) - - #dynamics的输入是encoding_size+action_space_size - self.dynamics_encoded_state_network = torch.nn.DataParallel( - mlp( - encoding_size + self.action_space_size, - fc_dynamics_layers, - encoding_size, - ) - ) - self.dynamics_reward_network = torch.nn.DataParallel( - mlp(encoding_size, fc_reward_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] - ) - - self.prediction_policy_network = torch.nn.DataParallel( - mlp(encoding_size, fc_policy_layers, self.action_space_size) #输出action的概率 - ) - self.prediction_value_network = torch.nn.DataParallel( - mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] - ) - - def prediction(self, encoded_state): - policy_logits = self.prediction_policy_network(encoded_state) - value = self.prediction_value_network(encoded_state) - return policy_logits, value - - def representation(self, observation): - encoded_state = self.representation_network( - observation.view(observation.shape[0], -1) - ) - - # 正则化 - # Scale encoded state between [0, 1] (See appendix paper Training) - min_encoded_state = encoded_state.min(1, keepdim=True)[0] - max_encoded_state = encoded_state.max(1, keepdim=True)[0] - scale_encoded_state = max_encoded_state - min_encoded_state - scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN - encoded_state_normalized = ( - encoded_state - min_encoded_state - ) / scale_encoded_state - return encoded_state_normalized - - # dynamic同representation的唯一不同就是前者需要将encoded_state和action合并在一起作为输入,而representation不需要绑定action - def dynamics(self, encoded_state, action): - # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) - action_one_hot = ( - torch.zeros((action.shape[0], self.action_space_size)) - .to(action.device) - .float() - ) - action_one_hot.scatter_(1, action.long(), 1.0) #将action的位置赋值为1 - x = torch.cat((encoded_state, action_one_hot), dim=1) - - next_encoded_state = 
self.dynamics_encoded_state_network(x) - - reward = self.dynamics_reward_network(next_encoded_state) - - # 正则化 - # Scale encoded state between [0, 1] (See paper appendix Training) - min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0] - max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0] - scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state - scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 # 防止为0,造成NAN - next_encoded_state_normalized = ( - next_encoded_state - min_next_encoded_state - ) / scale_next_encoded_state - - return next_encoded_state_normalized, reward - - def initial_inference(self, observation): - encoded_state = self.representation(observation) - policy_logits, value = self.prediction(encoded_state) - # reward equal to 0 for consistency 一致性奖励等于 0 - reward = torch.log( - ( - torch.zeros(1, self.full_support_size) - .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) - .repeat(len(observation), 1) - .to(observation.device) - ) - ) - # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 - - return ( - value, - reward, - policy_logits, - encoded_state, - ) - - def recurrent_inference(self, encoded_state, action): - next_encoded_state, reward = self.dynamics(encoded_state, action) - policy_logits, value = self.prediction(next_encoded_state) - return value, reward, policy_logits, next_encoded_state - - -###### End Fully Connected ####### -################################## - - -################################## -############# ResNet ############# - - -def conv3x3(in_channels, out_channels, stride=1): - return torch.nn.Conv2d( - in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False - ) - - -# Residual block -class ResidualBlock(torch.nn.Module): - def __init__(self, num_channels, stride=1): - super().__init__() - self.conv1 = conv3x3(num_channels, num_channels, stride) - self.bn1 = torch.nn.BatchNorm2d(num_channels) - self.conv2 = conv3x3(num_channels, num_channels) - self.bn2 = torch.nn.BatchNorm2d(num_channels) - - def forward(self, x): - out = self.conv1(x) - out = self.bn1(out) - out = torch.nn.functional.relu(out) - out = self.conv2(out) - out = self.bn2(out) - out += x - out = torch.nn.functional.relu(out) - return out - - -# Downsample observations before representation network (See paper appendix Network Architecture) -class DownSample(torch.nn.Module): - def __init__(self, in_channels, out_channels): - super().__init__() - self.conv1 = torch.nn.Conv2d( - in_channels, - out_channels // 2, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.resblocks1 = torch.nn.ModuleList( - [ResidualBlock(out_channels // 2) for _ in range(2)] - ) - self.conv2 = torch.nn.Conv2d( - out_channels // 2, - out_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.resblocks2 = torch.nn.ModuleList( - [ResidualBlock(out_channels) for _ in range(3)] - ) - self.pooling1 = torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1) - self.resblocks3 = torch.nn.ModuleList( - [ResidualBlock(out_channels) for _ in range(3)] - ) - self.pooling2 = torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1) - - def forward(self, x): - x = self.conv1(x) - for block in self.resblocks1: - x = block(x) - x = self.conv2(x) - for block in self.resblocks2: - x = block(x) - x = self.pooling1(x) - for block in self.resblocks3: - x = block(x) - x = self.pooling2(x) - return x - - -class DownsampleCNN(torch.nn.Module): - def __init__(self, in_channels, 
out_channels, h_w): - super().__init__() - mid_channels = (in_channels + out_channels) // 2 - self.features = torch.nn.Sequential( - torch.nn.Conv2d( - in_channels, mid_channels, kernel_size=h_w[0] * 2, stride=4, padding=2 - ), - torch.nn.ReLU(inplace=True), - torch.nn.MaxPool2d(kernel_size=3, stride=2), - torch.nn.Conv2d(mid_channels, out_channels, kernel_size=5, padding=2), - torch.nn.ReLU(inplace=True), - torch.nn.MaxPool2d(kernel_size=3, stride=2), - ) - self.avgpool = torch.nn.AdaptiveAvgPool2d(h_w) - - def forward(self, x): - x = self.features(x) - x = self.avgpool(x) - return x - - -class RepresentationNetwork(torch.nn.Module): - def __init__( - self, - observation_shape, - stacked_observations, - num_blocks, - num_channels, - downsample, - ): - super().__init__() - self.downsample = downsample - if self.downsample: - if self.downsample == "resnet": - self.downsample_net = DownSample( - observation_shape[0] * (stacked_observations + 1) - + stacked_observations, - num_channels, - ) - elif self.downsample == "CNN": - self.downsample_net = DownsampleCNN( - observation_shape[0] * (stacked_observations + 1) - + stacked_observations, - num_channels, - ( - math.ceil(observation_shape[1] / 16), - math.ceil(observation_shape[2] / 16), - ), - ) - else: - raise NotImplementedError('downsample should be "resnet" or "CNN".') - self.conv = conv3x3( - observation_shape[0] * (stacked_observations + 1) + stacked_observations, - num_channels, - ) - self.bn = torch.nn.BatchNorm2d(num_channels) - self.resblocks = torch.nn.ModuleList( - [ResidualBlock(num_channels) for _ in range(num_blocks)] - ) - - def forward(self, x): - if self.downsample: - x = self.downsample_net(x) - else: - x = self.conv(x) - x = self.bn(x) - x = torch.nn.functional.relu(x) - - for block in self.resblocks: - x = block(x) - return x - - -class DynamicsNetwork(torch.nn.Module): - def __init__( - self, - num_blocks, - num_channels, - reduced_channels_reward, - fc_reward_layers, - full_support_size, - block_output_size_reward, - ): - super().__init__() - self.conv = conv3x3(num_channels, num_channels - 1) - self.bn = torch.nn.BatchNorm2d(num_channels - 1) - self.resblocks = torch.nn.ModuleList( - [ResidualBlock(num_channels - 1) for _ in range(num_blocks)] - ) - - self.conv1x1_reward = torch.nn.Conv2d( - num_channels - 1, reduced_channels_reward, 1 - ) - self.block_output_size_reward = block_output_size_reward - self.fc = mlp( - self.block_output_size_reward, - fc_reward_layers, - full_support_size, - ) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = torch.nn.functional.relu(x) - for block in self.resblocks: - x = block(x) - state = x - x = self.conv1x1_reward(x) - x = x.view(-1, self.block_output_size_reward) - reward = self.fc(x) - return state, reward - - -class PredictionNetwork(torch.nn.Module): - def __init__( - self, - action_space_size, - num_blocks, - num_channels, - reduced_channels_value, - reduced_channels_policy, - fc_value_layers, - fc_policy_layers, - full_support_size, - block_output_size_value, - block_output_size_policy, - ): - super().__init__() - self.resblocks = torch.nn.ModuleList( - [ResidualBlock(num_channels) for _ in range(num_blocks)] - ) - - self.conv1x1_value = torch.nn.Conv2d(num_channels, reduced_channels_value, 1) - self.conv1x1_policy = torch.nn.Conv2d(num_channels, reduced_channels_policy, 1) - self.block_output_size_value = block_output_size_value - self.block_output_size_policy = block_output_size_policy - self.fc_value = mlp( - self.block_output_size_value, fc_value_layers, 
full_support_size - ) - self.fc_policy = mlp( - self.block_output_size_policy, - fc_policy_layers, - action_space_size, - ) - - def forward(self, x): - for block in self.resblocks: - x = block(x) - value = self.conv1x1_value(x) - policy = self.conv1x1_policy(x) - value = value.view(-1, self.block_output_size_value) - policy = policy.view(-1, self.block_output_size_policy) - value = self.fc_value(value) - policy = self.fc_policy(policy) - return policy, value - - -class MuZeroResidualNetwork(AbstractNetwork): - def __init__( - self, - observation_shape, - stacked_observations, - action_space_size, - num_blocks, - num_channels, - reduced_channels_reward, - reduced_channels_value, - reduced_channels_policy, - fc_reward_layers, - fc_value_layers, - fc_policy_layers, - support_size, - downsample, - ): - super().__init__() - self.action_space_size = action_space_size - self.full_support_size = 2 * support_size + 1 - block_output_size_reward = ( - ( - reduced_channels_reward - * math.ceil(observation_shape[1] / 16) - * math.ceil(observation_shape[2] / 16) - ) - if downsample - else (reduced_channels_reward * observation_shape[1] * observation_shape[2]) - ) - - block_output_size_value = ( - ( - reduced_channels_value - * math.ceil(observation_shape[1] / 16) - * math.ceil(observation_shape[2] / 16) - ) - if downsample - else (reduced_channels_value * observation_shape[1] * observation_shape[2]) - ) - - block_output_size_policy = ( - ( - reduced_channels_policy - * math.ceil(observation_shape[1] / 16) - * math.ceil(observation_shape[2] / 16) - ) - if downsample - else (reduced_channels_policy * observation_shape[1] * observation_shape[2]) - ) - - self.representation_network = torch.nn.DataParallel( - RepresentationNetwork( - observation_shape, - stacked_observations, - num_blocks, - num_channels, - downsample, - ) - ) - - self.dynamics_network = torch.nn.DataParallel( - DynamicsNetwork( - num_blocks, - num_channels + 1, - reduced_channels_reward, - fc_reward_layers, - self.full_support_size, - block_output_size_reward, - ) - ) - - self.prediction_network = torch.nn.DataParallel( - PredictionNetwork( - action_space_size, - num_blocks, - num_channels, - reduced_channels_value, - reduced_channels_policy, - fc_value_layers, - fc_policy_layers, - self.full_support_size, - block_output_size_value, - block_output_size_policy, - ) - ) - - def prediction(self, encoded_state): - policy, value = self.prediction_network(encoded_state) - return policy, value - - def representation(self, observation): - encoded_state = self.representation_network(observation) - - # Scale encoded state between [0, 1] (See appendix paper Training) - min_encoded_state = ( - encoded_state.view( - -1, - encoded_state.shape[1], - encoded_state.shape[2] * encoded_state.shape[3], - ) - .min(2, keepdim=True)[0] - .unsqueeze(-1) - ) - max_encoded_state = ( - encoded_state.view( - -1, - encoded_state.shape[1], - encoded_state.shape[2] * encoded_state.shape[3], - ) - .max(2, keepdim=True)[0] - .unsqueeze(-1) - ) - scale_encoded_state = max_encoded_state - min_encoded_state - scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 - encoded_state_normalized = ( - encoded_state - min_encoded_state - ) / scale_encoded_state - return encoded_state_normalized - - def dynamics(self, encoded_state, action): - # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) - action_one_hot = ( - torch.ones( - ( - encoded_state.shape[0], - 1, - encoded_state.shape[2], - encoded_state.shape[3], - ) - ) - 
.to(action.device) - .float() - ) - action_one_hot = ( - action[:, :, None, None] * action_one_hot / self.action_space_size - ) - x = torch.cat((encoded_state, action_one_hot), dim=1) - next_encoded_state, reward = self.dynamics_network(x) - - # Scale encoded state between [0, 1] (See paper appendix Training) - min_next_encoded_state = ( - next_encoded_state.view( - -1, - next_encoded_state.shape[1], - next_encoded_state.shape[2] * next_encoded_state.shape[3], - ) - .min(2, keepdim=True)[0] - .unsqueeze(-1) - ) - max_next_encoded_state = ( - next_encoded_state.view( - -1, - next_encoded_state.shape[1], - next_encoded_state.shape[2] * next_encoded_state.shape[3], - ) - .max(2, keepdim=True)[0] - .unsqueeze(-1) - ) - scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state - scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 - next_encoded_state_normalized = ( - next_encoded_state - min_next_encoded_state - ) / scale_next_encoded_state - return next_encoded_state_normalized, reward - - def initial_inference(self, observation): - encoded_state = self.representation(observation) - policy_logits, value = self.prediction(encoded_state) - # reward equal to 0 for consistency - reward = torch.log( - ( - torch.zeros(1, self.full_support_size) - .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # 将support_size位置设为1 - .repeat(len(observation), 1) # 根据observation的长度复制,保证reward的维度于observation的一致,即之前的observation也赋值 - .to(observation.device) - ) - ) - return ( - value, - reward, - policy_logits, - encoded_state, - ) - - def recurrent_inference(self, encoded_state, action): - next_encoded_state, reward = self.dynamics(encoded_state, action) - policy_logits, value = self.prediction(next_encoded_state) - return value, reward, policy_logits, next_encoded_state - - -########### End ResNet ########### -################################## - - -def mlp( - input_size, - layer_sizes, - output_size, - output_activation=torch.nn.Identity, - activation=torch.nn.ELU, -): - sizes = [input_size] + layer_sizes + [output_size] - layers = [] - for i in range(len(sizes) - 1): - act = activation if i < len(sizes) - 2 else output_activation #激活函数,最后一层是output_activation,其余的都一样 - layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()] - return torch.nn.Sequential(*layers) - - -def support_to_scalar(logits, support_size): # logits 是 value的对数值,support_size是转换后的范围。 - """ - Transform a categorical representation to a scalar - See paper appendix Network Architecture - """ - # Decode to a scalar - probabilities = torch.softmax(logits, dim=1) # softmax在指定的向量和为1,softmax扩大大的,缩小下的,shape为[stacked_size, fully_support_size] - support = ( - torch.tensor([x for x in range(-support_size, support_size + 1)]) # 范围是-support_size, support_szie。因为support_size+1 - .expand(probabilities.shape) - .float() - .to(device=probabilities.device) - ) # shape 为【stacked_size, fully_support_size】, - x = torch.sum(support * probabilities, dim=1, keepdim=True) # 输出为【1,fully_support_size】,因为dim=1,另外keep_dim=True,所有是【1,fully_support_size】而不是【fully_support_size] - - # Invert the scaling (defined in https://arxiv.org/abs/1805.11593) - x = torch.sign(x) * ( # sign函数为分段函数, x小于0为-1,大于0为1,0为0。主要是获取x的符号 - ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) # (sqrt(1+0.04*(|x|+1.001))-1)/0.002 - ** 2 - - 1 - ) - return x - - -def scalar_to_support(x, support_size): - """ - Transform a scalar to a categorical representation with (2 * support_size + 1) categories - See paper appendix Network 
Architecture - """ - # Reduce the scale (defined in https://arxiv.org/abs/1805.11593) - x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x - - # Encode on a vector - x = torch.clamp(x, -support_size, support_size) # 裁剪x的范围,使x的范围定为[-support_size, support_size] - floor = x.floor() # floor向下取整,类似的,ceil为向上取整 - prob = x - floor # 减去整数,保留小数部分(因为在support_to_scala部分是index位置乘上概率) - logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device) - logits.scatter_( - 2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1) - ) - indexes = floor + support_size + 1 - prob = prob.masked_fill_(2 * support_size < indexes, 0.0) - indexes = indexes.masked_fill_(2 * support_size < indexes, 0.0) - logits.scatter_(2, indexes.long().unsqueeze(-1), prob.unsqueeze(-1)) - return logits diff --git a/simplifiedMuZero/without_rb/play_buffer.py b/simplifiedMuZero/without_rb/play_buffer.py new file mode 100644 index 00000000..ad13a67f --- /dev/null +++ b/simplifiedMuZero/without_rb/play_buffer.py @@ -0,0 +1,214 @@ +import numpy +import torch +import copy +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + 
index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. + """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
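compute_target_value above builds the n-step value target: the discounted root value td_steps into the future plus the discounted rewards collected on the way. A single-player distillation of it, on made-up numbers (the two-player sign flips from the original are omitted to keep the example short):

```python
# Single-player sketch of compute_target_value, on made-up numbers.
def n_step_value(root_values, rewards, index, td_steps, discount):
    bootstrap = index + td_steps
    # Discounted search value td_steps into the future, if the game is long enough.
    value = (
        root_values[bootstrap] * discount**td_steps
        if bootstrap < len(root_values)
        else 0.0
    )
    # Plus the discounted rewards collected between index and the bootstrap step.
    for i, reward in enumerate(rewards[index + 1 : bootstrap + 1]):
        value += reward * discount**i
    return value

root_values = [0.5, 0.6, 0.7, 0.9, 1.0]   # search values per position
rewards     = [0.0, 0.0, 1.0, 0.0, 1.0]   # rewards recorded after each move
print(n_step_value(root_values, rewards, index=0, td_steps=3, discount=0.99))
# 0.9 * 0.99**3 + 0.0 + 1.0 * 0.99 + 0.0 ≈ 1.863
```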
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions diff --git a/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py b/simplifiedMuZero/without_rb/trainer.py similarity index 67% rename from simplifiedMuZero/without_rb/trainer_without_replay_buffer.py rename to simplifiedMuZero/without_rb/trainer.py index e2f64fa2..265b13c5 100644 --- a/simplifiedMuZero/without_rb/trainer_without_replay_buffer.py +++ b/simplifiedMuZero/without_rb/trainer.py @@ -1,21 +1,14 @@ -import copy -import time - import numpy -# import ray import torch +import models -import simplifiedMuZero.without_rb.models_without_replay_buffer as models - - -@ray.remote class Trainer: """ Class which run in a dedicated thread to train a neural network and save it in the shared storage. """ - def __init__(self, initial_checkpoint, config): + def __init__(self, model_cls, initial_checkpoint, config): self.config = config # Fix random generator seed @@ -23,8 +16,8 @@ def __init__(self, initial_checkpoint, config): torch.manual_seed(self.config.seed) # Initialize the network - self.model = models.MuZeroNetwork(self.config) - self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) self.model.train() @@ -52,77 +45,29 @@ def __init__(self, initial_checkpoint, config): f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." ) - if initial_checkpoint["optimizer_state"] is not None: - print("Loading optimizer...\n") - self.optimizer.load_state_dict( - copy.deepcopy(initial_checkpoint["optimizer_state"]) - ) - - # update weights 与 continuous update weights 的区别 - # 1. update weights 是实际计算更新network的权重 - # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 - def continuous_update_weights(self, replay_buffer, shared_storage): - # Wait for the replay buffer to be filled - while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: - time.sleep(0.1) - - next_batch = replay_buffer.get_batch.remote() - # Training loop - while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 - ): - index_batch, batch = ray.get(next_batch) - next_batch = replay_buffer.get_batch.remote() - self.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = self.update_weights(batch) - - if self.config.PER: - # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) - replay_buffer.update_priorities.remote(priorities, index_batch) - - # Save to the shared storage - if self.training_step % self.config.checkpoint_interval == 0: - shared_storage.set_info.remote( - { - "weights": copy.deepcopy(self.model.get_weights()), - "optimizer_state": copy.deepcopy( - models.dict_to_cpu(self.optimizer.state_dict()) - ), - } - ) - if self.config.save_model: - shared_storage.save_checkpoint.remote() - shared_storage.set_info.remote( - { - "training_step": self.training_step, - "lr": self.optimizer.param_groups[0]["lr"], - "total_loss": total_loss, - "value_loss": value_loss, - "reward_loss": reward_loss, - "policy_loss": policy_loss, - } - ) - - # Managing the self-play / training ratio - if self.config.training_delay: - time.sleep(self.config.training_delay) - if self.config.ratio: - while ( - self.training_step - / max( - 1, ray.get(shared_storage.get_info.remote("num_played_steps")) - ) - > self.config.ratio - and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) - ): - time.sleep(0.5) + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) def update_weights(self, batch): """ @@ -144,8 +89,6 @@ def update_weights(self, batch): priorities = numpy.zeros_like(target_value_scalar) device = next(self.model.parameters()).device - if self.config.PER: - weight_batch = torch.tensor(weight_batch.copy()).float().to(device) observation_batch = ( torch.tensor(numpy.array(observation_batch)).float().to(device) ) @@ -254,9 +197,7 @@ def update_weights(self, batch): # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - if self.config.PER: - # Correct PER bias by using importance-sampling (IS) weights - loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) loss = loss.mean() @@ -297,7 +238,6 @@ def loss_function( # Cross-entropy seems to have a better convergence than MSE value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) - policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( - 1 - ) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + return value_loss, reward_loss, policy_loss diff --git a/test/game_play_test.py b/test/game_play_test.py index 60b6a5ec..78fdc4a5 100644 --- a/test/game_play_test.py +++ b/test/game_play_test.py @@ -675,7 +675,15 @@ def loss_function( # print(game_id) # print(game_history.action_history) - # print(game_history.reward_history) + print(game_history.reward_history) + muzero_reward = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + + print(muzero_reward) # print(game_history.to_play_history) # # print(game_history.observation_history) # print("child visits", game_history.child_visits) diff --git a/test/mcts_test.py b/test/mcts_test.py new file mode 100644 index 00000000..d3edc0f3 --- /dev/null +++ b/test/mcts_test.py @@ -0,0 +1,245 @@ +import models +from self_play import MCTS, GameHistory, Node, MinMaxStats +from games.tictactoe import MuZeroConfig, Game + +import torch +import numpy +import math + +class MCTS1: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 
并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + print(override_root_with) + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + 
"max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. + """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. 
+ """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + +config = MuZeroConfig() +game = Game(config.seed) + +game_history = GameHistory() + +observation = game.reset() + +game_history.action_history.append(0) +game_history.observation_history.append(observation) # 添加reset之后的observation +game_history.reward_history.append(0) +game_history.to_play_history.append(game.to_play()) + +stacked_observations = game_history.get_stacked_observations( -1, config.stacked_observations, len(config.action_space)) + +done = False + +model = models.MuZeroNetwork(config) + +root, mcts_info = MCTS1(config).run(model, stacked_observations, game.legal_actions(), game.to_play(), True) + +print(root) + +game.close() \ No newline at end of file diff --git a/test/muzero_config_test.py b/test/muzero_config_test.py new file mode 100644 index 00000000..1b5fc135 --- /dev/null +++ b/test/muzero_config_test.py @@ -0,0 +1,6 @@ +from games.simple_grid import MuZeroConfig + +if __name__ == "__main__": + config = MuZeroConfig() + config.results_path /= "config_test" + print(config.results_path) \ No newline at end of file diff --git a/trainer.py b/trainer.py index 3e035c51..849beaa2 100644 --- a/trainer.py +++ b/trainer.py @@ -279,7 +279,7 @@ def update_lr(self): lr = self.config.lr_init * self.config.lr_decay_rate ** ( self.training_step / self.config.lr_decay_steps ) - for param_group in self.optimizer.param_groups: + for param_group in self.optimizer.param_groups: # 更新optimizer的lr param_group["lr"] = lr @staticmethod From 65ac04459be1d3fa04126c2668d7318467d18782 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Fri, 18 Aug 2023 05:55:10 +0100 Subject: [PATCH 5/9] simplified MuZero --- muzero_general.py | 413 +++++++++++++++++++++++++++++++ muzero_without_replay_buffer2.py | 331 +------------------------ simplifiedMuZero/models2.py | 366 +++++++++++++++++++++++---- simplified_muzero.py | 108 ++++++++ simplified_muzero2.py | 108 ++++++++ test/deap_test.py | 44 ++++ 6 files changed, 998 insertions(+), 372 deletions(-) create mode 100644 muzero_general.py create mode 100644 simplified_muzero.py create mode 100644 simplified_muzero2.py create mode 100644 test/deap_test.py diff --git a/muzero_general.py b/muzero_general.py new file mode 100644 index 00000000..6d8363d9 --- /dev/null +++ b/muzero_general.py @@ -0,0 +1,413 @@ +import importlib +import ray +import pathlib + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter + +import math +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer +from muzero import load_model_menu, hyperparameter_search + +import models + + +class CPUActorWithClass: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def 
get_initial_weights(self, config, model_cls): + model = model_cls(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + +class MuZeroGeneral: + def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + if save_path_ex: + self.config.results_path /= save_path_ex + else: + self.config.results_path /= model_cls.__name__ + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + self.model_cls = model_cls + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActorWithClass() + cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) + self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) + + + def logging_loop(self, writer, training_steps): + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + self.checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + 
"1.Total_reward/2.Mean_value", + self.checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + self.checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + self.checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + self.checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + self.checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + self.checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) + print( + f'Last test reward: {self.checkpoint["total_reward"]:.2f}. Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + + def update_gameplay_checkpoint(self, game_history): + self.checkpoint["episode_length"] = len(game_history.action_history) - 1 + self.checkpoint["total_reward"] = sum(game_history.reward_history) + self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(self.config.players): + self.checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ) + self.checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ) + + def save_checkpoint(self, path=None): #将模型存储在文件中 + if not path: + path = self.config.results_path / "model.checkpoint" + + torch.save(self.checkpoint, path) + + def train(self, log_in_tensorboard=True): + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + + trainer = Trainer(self.model_cls, self.checkpoint, self.config) + game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) + buffer = {} + play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = 
int(self.config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(self.config.results_path) + + for episode in range(max_steps): + game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + self.update_gameplay_checkpoint( game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % self.config.checkpoint_interval == 0: + self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if self.config.save_model: + self.save_checkpoint() + self.checkpoint["training_step"] = training_step + self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + self.checkpoint["total_loss"] = total_loss + self.checkpoint["value_loss"] = value_loss + self.checkpoint["reward_loss"] = reward_loss + self.checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + self.logging_loop(writer, training_step) + + + writer.close() + + game_play.close_game() + +# if __name__ == "__main__": +# # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") +# # start_time = time.time() +# # muzero.train() +# # end_time = time.time() +# # print("耗时: {:.2f}秒".format(end_time - start_time)) +# model_cls = models.MuZeroNetwork +# if len(sys.argv) == 2: +# # Train directly with: python muzero.py cartpole +# muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) +# muzero.train() +# elif len(sys.argv) == 3: +# # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' +# config = json.loads(sys.argv[2]) +# muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) +# muzero.train() +# else: +# print("\nWelcome to MuZero! Here's a list of games:") +# # Let user pick a game +# games = [ +# filename.stem +# for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) +# if filename.name != "abstract_game.py" +# ] +# for i in range(len(games)): +# print(f"{i}. {games[i]}") +# choice = input("Enter a number to choose the game: ") +# valid_inputs = [str(i) for i in range(len(games))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# +# # Initialize MuZero +# choice = int(choice) +# game_name = games[choice] +# muzero = MuZeroGeneral(game_name, model_cls=model_cls) +# +# while True: +# # Configure running options +# options = [ +# "Train", +# "Load pretrained model", +# "Diagnose model", +# "Render some self play games", +# "Play against MuZero", +# "Test the game manually", +# "Hyperparameter search", +# "Exit", +# ] +# print() +# for i in range(len(options)): +# print(f"{i}. 
{options[i]}") +# +# choice = input("Enter a number to choose an action: ") +# valid_inputs = [str(i) for i in range(len(options))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# choice = int(choice) +# if choice == 0: +# start_time = time.time() +# muzero.train() +# end_time = time.time() +# print("耗时: {:.2f}秒".format(end_time - start_time)) +# elif choice == 1: +# load_model_menu(muzero, game_name) +# elif choice == 2: +# muzero.diagnose_model(30) +# elif choice == 3: +# muzero.test(render=True, opponent="self", muzero_player=None) +# elif choice == 4: +# muzero.test(render=True, opponent="human", muzero_player=0) +# elif choice == 5: +# env = muzero.Game() +# env.reset() +# env.render() +# +# done = False +# while not done: +# action = env.human_to_action() +# observation, reward, done = env.step(action) +# print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") +# env.render() +# elif choice == 6: +# # Define here the parameters to tune +# # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html +# muzero.terminate_workers() +# del muzero +# budget = 20 +# parallel_experiments = 2 +# lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) +# discount = nevergrad.p.Log(lower=0.95, upper=0.9999) +# parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) +# best_hyperparameters = hyperparameter_search( +# game_name, parametrization, budget, parallel_experiments, 20 +# ) +# muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) +# else: +# break +# print("\nDone") diff --git a/muzero_without_replay_buffer2.py b/muzero_without_replay_buffer2.py index ebbb147f..4b87fc7b 100644 --- a/muzero_without_replay_buffer2.py +++ b/muzero_without_replay_buffer2.py @@ -1,321 +1,12 @@ -import pathlib -import importlib -import ray - -import numpy -import torch -from torch.utils.tensorboard import SummaryWriter -import pickle +import models +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search -import math +import json +import sys +import pathlib import time -import copy import nevergrad -import sys -import json - -from simplifiedMuZero.without_rb.game_play import GamePlay -from simplifiedMuZero.without_rb.play_buffer import PlayBuffer -from simplifiedMuZero.without_rb.trainer import Trainer -from muzero import load_model_menu, hyperparameter_search - -import models - - -class CPUActorWithClass: - # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU - def __init__(self): - pass - - def get_initial_weights(self, config, model_cls): - model = model_cls(config) - weigths = model.get_weights() - summary = str(model).replace("\n", " \n\n") - return weigths, summary - -class MuZeroWithoutRB: - def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): - # Load the game and the config from the module with the game name - try: - game_module = importlib.import_module("games." + game_name) - print("games." + game_name) - self.Game = game_module.Game - self.config = game_module.MuZeroConfig() - if save_path_ex: - config.results_path /= save_path_ex - except ModuleNotFoundError as err: - print( - f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' 
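The commented-out entry point in muzero_general.py above shows how MuZeroGeneral is intended to be driven interactively. A minimal non-interactive driver along the same lines, assuming the repository's games/cartpole.py config as in that commented example:

```python
# Minimal non-interactive driver for MuZeroGeneral, mirroring the commented-out
# entry point above; assumes games/cartpole.py provides Game and MuZeroConfig.
import time

import models
from muzero_general import MuZeroGeneral

if __name__ == "__main__":
    muzero = MuZeroGeneral("cartpole", model_cls=models.MuZeroNetwork,
                           save_path_ex="muzero_without_rb")
    start_time = time.time()
    muzero.train()
    print("Training took {:.2f}s".format(time.time() - start_time))
```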
- ) - raise err - - self.model_cls = model_cls - - # Overwrite the config - if config: - if type(config) is dict: - for param, value in config.items(): - if hasattr(self.config, param): - setattr(self.config, param, value) - else: - raise AttributeError( - f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." - ) - else: - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Manage GPUs - if self.config.max_num_gpus == 0 and ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - raise ValueError( - "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." - ) - if ( - self.config.selfplay_on_gpu - or self.config.train_on_gpu - or self.config.reanalyse_on_gpu - ): - total_gpus = ( - self.config.max_num_gpus - if self.config.max_num_gpus is not None - else torch.cuda.device_count() - ) - else: - total_gpus = 0 - self.num_gpus = total_gpus / split_resources_in - if 1 < self.num_gpus: - self.num_gpus = math.floor(self.num_gpus) - - # Checkpoint and replay buffer used to initialize workers - self.checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - self.replay_buffer = {} - - cpu_actor = CPUActorWithClass() - cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) - self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) - - - def logging_loop(self, writer, training_steps): - # writer = SummaryWriter(config.results_path) - - # print( - # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" - # ) - - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in self.config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # # Save model representation - # writer.add_text( - # "Model summary", - # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 - # ) - # Loop for updating the training performance - counter = training_steps - - try: - if True: - # while checkpoint["training_step"] < config.training_steps: - writer.add_scalar( - "1.Total_reward/1.Total_reward", - self.checkpoint["total_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/2.Mean_value", - self.checkpoint["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - self.checkpoint["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - self.checkpoint["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - self.checkpoint["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - self.checkpoint["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter - ) - writer.add_scalar( - 
"2.Workers/4.Reanalysed_games", - self.checkpoint["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), - counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) - print( - f'Last test reward: {self.checkpoint["total_reward"]:.2f}. Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - # time.sleep(0.5) - except KeyboardInterrupt: - pass - - # if config.save_model: - # # Persist replay buffer to disk - # path = config.results_path / "replay_buffer.pkl" - # print(f"\n\nPersisting replay buffer games to disk at {path}") - # pickle.dump( - # { - # "buffer": buffer, - # "num_played_games": checkpoint["num_played_games"], - # "num_played_steps": checkpoint["num_played_steps"], - # "num_reanalysed_games": checkpoint["num_reanalysed_games"], - # }, - # open(path, "wb"), - # ) - - def update_gameplay_checkpoint(self, game_history): - self.checkpoint["episode_length"] = len(game_history.action_history) - 1 - self.checkpoint["total_reward"] = sum(game_history.reward_history) - self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) - - if 1 < len(self.config.players): - self.checkpoint["muzero_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - == self.config.muzero_player - ) - self.checkpoint["opponent_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - != self.config.muzero_player - ) - - def save_checkpoint(self, path=None): #将模型存储在文件中 - if not path: - path = self.config.results_path / "model.checkpoint" - - torch.save(self.checkpoint, path) - - def train(self, log_in_tensorboard=True): - if log_in_tensorboard or self.config.save_model: - self.config.results_path.mkdir(parents=True, exist_ok=True) - - - trainer = Trainer(models.MuZeroNetwork, self.checkpoint, self.config) - game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) - buffer = {} - play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) - - step = 1 # 间隔,即每次模拟后训练多少次 - max_steps = int(self.config.training_steps/step) - # max_steps = 2000 - - writer = SummaryWriter(self.config.results_path) - - for episode in range(max_steps): - game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) - - # print(game_id) - # print(game_history.action_history) - # print(game_history.reward_history) - # print(game_history.to_play_history) - # # print(game_history.observation_history) - # print("child visits", game_history.child_visits) - # print(game_history.root_values) # root value指的是root节点的UCB值 - - play_buffer.update_game_history(game_id, game_history) - self.update_gameplay_checkpoint( game_history) - - for i in range(step): - index_batch, batch = 
play_buffer.get_batch() - # print(batch[1]) - trainer.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = trainer.update_weights(batch) - - - training_step = episode * step + i - if training_step % self.config.checkpoint_interval == 0: - self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) - self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) - - if self.config.save_model: - self.save_checkpoint() - self.checkpoint["training_step"] = training_step - self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] - self.checkpoint["total_loss"] = total_loss - self.checkpoint["value_loss"] = value_loss - self.checkpoint["reward_loss"] = reward_loss - self.checkpoint["policy_loss"] = policy_loss - - # print(training_step) - # if training_step % 500 == 0: - # if training_step % config.checkpoint_interval == 0: - # # print(training_step) - # logging_loop(config, checkpoint, writer) - - self.logging_loop(writer, training_step) - - - writer.close() - - game_play.close_game() if __name__ == "__main__": # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") @@ -326,12 +17,12 @@ def train(self, log_in_tensorboard=True): model_cls = models.MuZeroNetwork if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole - muzero = MuZeroWithoutRB(sys.argv[1], model_cls=model_cls) + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) muzero.train() elif len(sys.argv) == 3: # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' config = json.loads(sys.argv[2]) - muzero = MuZeroWithoutRB(sys.argv[1], config, model_cls=model_cls) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) muzero.train() else: print("\nWelcome to MuZero! 
Here's a list of games:") @@ -351,7 +42,7 @@ def train(self, log_in_tensorboard=True): # Initialize MuZero choice = int(choice) game_name = games[choice] - muzero = MuZeroWithoutRB(game_name, model_cls=model_cls) + muzero = MuZeroGeneral(game_name, model_cls=model_cls) while True: # Configure running options @@ -411,7 +102,7 @@ def train(self, log_in_tensorboard=True): best_hyperparameters = hyperparameter_search( game_name, parametrization, budget, parallel_experiments, 20 ) - muzero = MuZeroWithoutRB(game_name, best_hyperparameters , model_cls=model_cls) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) else: break - print("\nDone") + print("\nDone") \ No newline at end of file diff --git a/simplifiedMuZero/models2.py b/simplifiedMuZero/models2.py index 4fb55bad..fd6aa6ee 100644 --- a/simplifiedMuZero/models2.py +++ b/simplifiedMuZero/models2.py @@ -3,12 +3,13 @@ import torch -from models import * +from models import support_to_scalar, scalar_to_support, mlp, AbstractNetwork, conv3x3, RepresentationNetwork, DynamicsNetwork, PredictionNetwork -class SimplifiedMuZeroNetwork: +class MuZeroNetwork_2net: def __new__(cls, config): + print("MuZeroNetwork_2net") if config.network == "fullyconnected": - return SimplifiedMuZeroFullyConnectedNetwork( + return MuZeroFullyConnectedNetwork_2net( config.observation_shape, config.stacked_observations, len(config.action_space), @@ -21,7 +22,8 @@ def __new__(cls, config): config.support_size, ) elif config.network == "resnet": - return MuZeroResidualNetwork( + print("resnet") + return MuZeroResidualNetwork_2net( config.observation_shape, config.stacked_observations, len(config.action_space), @@ -40,64 +42,70 @@ def __new__(cls, config): raise NotImplementedError( 'The network parameter should be "fullyconnected" or "resnet".' ) -class SimplifiedMuZeroFullyConnectedNetwork(AbstractNetwork): - def __init__(self, - observation_shape, - stacked_observations, - action_space_size, - encoding_size, - fc_reward_layers, - fc_value_layers, - fc_policy_layers, - fc_representation_layers, - fc_dynamics_layers, - support_size, - ): +class MuZeroFullyConnectedNetwork_2net(AbstractNetwork): + def __init__( + self, + observation_shape, + stacked_observations, + action_space_size, + encoding_size, + fc_reward_layers, + fc_value_layers, + fc_policy_layers, + fc_representation_layers, + fc_dynamics_layers, + support_size, + ): super().__init__() - # 动作空间大小 self.action_space_size = action_space_size - #为什么是2*support_size +1 self.full_support_size = 2 * support_size + 1 - representation_input_size = observation_shape[0] * observation_shape[1] * observation_shape[2] * (stacked_observations + 1)\ + # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 + + representation_input_size = observation_shape[0] * observation_shape[1] * observation_shape[2] * ( + stacked_observations + 1) \ + stacked_observations * observation_shape[1] * observation_shape[2] - # 改进方法: - # 1. input size = encoding _size - # 2. 
input 后边加上 action space - self.representation_network = torch.nn.DataParallel( - mlp( - representation_input_size, - fc_representation_layers, - encoding_size - ) - ) + # 输出等于输入,即编码维度等于输入维度 + encoding_size = representation_input_size + + # self.representation_network = torch.nn.DataParallel( + # # mlp( + # # representation_input_size, + # # fc_representation_layers, + # # encoding_size, + # # ) + # mlp( + # representation_input_size + self.action_space_size, + # fc_representation_layers, + # encoding_size, + # ) + # ) - self.dynamic_encoded_state_network = torch.nn.DataParallel( + #dynamics的输入是encoding_size+action_space_size + self.dynamics_encoded_state_network = torch.nn.DataParallel( mlp( - encoding_size +self.action_space_size, + encoding_size + self.action_space_size, fc_dynamics_layers, - encoding_size + encoding_size, ) ) - self.dynamics_reward_network = torch.nn.DataParallel( - mlp(encoding_size, fc_reward_layers, self.full_support_size) + mlp(encoding_size, fc_reward_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) - self.prediction_polic_network = torch.nn.DataParallel( - mlp(encoding_size, fc_policy_layers, self.action_space_size) + self.prediction_policy_network = torch.nn.DataParallel( + mlp(encoding_size, fc_policy_layers, self.action_space_size) #输出action的概率 ) - self.prediction_value_network = torch.nn.DataParallel( - mlp(encoding_size, fc_value_layers, self.full_support_size) + mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) - def prediction(self, encode_state): - policy_logits = self.prediction_polic_network(encode_state) - value = self.prediction_value_network(encode_state) + def prediction(self, encoded_state): + policy_logits = self.prediction_policy_network(encoded_state) + value = self.prediction_value_network(encoded_state) return policy_logits, value - # 将encoded_stated标准化 + # 将encoded_stated标准化 def encoded_stated_normalized(self, encoded_state): min_encoded_state = encoded_state.min(1, keepdim=True)[0] max_encoded_state = encoded_state.max(1, keepdim=True)[0] @@ -106,11 +114,17 @@ def encoded_stated_normalized(self, encoded_state): encoded_state_normalized = (encoded_state - min_encoded_state) / scale_encoded_state return encoded_state_normalized - def representation(self, observation): - encoded_state = self.representation_network( - observation.view(observation.shape[0], -1) - ) + observation = observation.view(observation.shape[0], -1) + action_zeros = (torch.zeros((observation.shape[0], self.action_space_size)).to(observation.device).float()) + x = torch.cat((observation, action_zeros), dim=1) + + # encoded_state = self.representation_network(x) + encoded_state = self.dynamics_encoded_state_network(x) + + # encoded_state = self.representation_network( + # observation.view(observation.shape[0], -1) + # ) return self.encoded_stated_normalized(encoded_state) @@ -120,10 +134,9 @@ def dynamics(self, encoded_state, action): action_one_hot.scatter(1, action.long(), 1.0) x = torch.cat((encoded_state, action_one_hot), dim=1) - next_encoded_state = self.dynamic_encoded_state_network(x) + next_encoded_state = self.dynamics_encoded_state_network(x) reward = self.dynamics_reward_network(next_encoded_state) - next_encoded_state_normalized = self.encoded_stated_normalized(next_encoded_state) return next_encoded_state_normalized, reward @@ -131,8 +144,7 @@ def dynamics(self, encoded_state, action): def initial_inference(self, observation): encoded_state = 
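# Sketch (illustrative shapes, not from this patch) of the representation-through-
# dynamics trick used by MuZeroFullyConnectedNetwork_2net above: because
# encoding_size is set equal to the flattened observation size, the initial hidden
# state is produced by pushing the raw observation plus an all-zero action vector
# through the shared dynamics MLP instead of a separate representation network.
import torch

obs_size, action_space_size = 12, 4
shared_dynamics = torch.nn.Linear(obs_size + action_space_size, obs_size)  # stand-in for the dynamics MLP

observation = torch.randn(2, obs_size)            # batch of flattened observations
zero_action = torch.zeros(2, action_space_size)   # "no action yet" placeholder
initial_state = shared_dynamics(torch.cat((observation, zero_action), dim=1))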
self.representation(observation) policy_logits, value = self.prediction(encoded_state) - - # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 + # reward equal to 0 for consistency 一致性奖励等于 0 reward = torch.log( ( torch.zeros(1, self.full_support_size) @@ -141,8 +153,258 @@ def initial_inference(self, observation): .to(observation.device) ) ) + # reward的样子为[[0,0,...,0,1,0,...,0,0],...]。即中间值为1,其余全为0,然后重复于observation行数相同的次数 + + return ( + value, + reward, + policy_logits, + encoded_state, + ) + + def recurrent_inference(self, encoded_state, action): + next_encoded_state, reward = self.dynamics(encoded_state, action) + policy_logits, value = self.prediction(next_encoded_state) + return value, reward, policy_logits, next_encoded_state + +class MuZeroResidualNetwork_2net(AbstractNetwork): + def __init__( + self, + observation_shape, + stacked_observations, # stacken_observations表示先去观察的数量,用在那些需要历史信息的游戏里。如果不需要历史观察,该值为0 + action_space_size, + num_blocks, + num_channels, + reduced_channels_reward, + reduced_channels_value, + reduced_channels_policy, + fc_reward_layers, + fc_value_layers, + fc_policy_layers, + support_size, + downsample, + ): + super().__init__() + print("observation shape is ", observation_shape) + print("num channels is ", num_channels) + + num_channels = observation_shape[1] + self.action_space_size = action_space_size + self.full_support_size = 2 * support_size + 1 + block_output_size_reward = ( + ( + reduced_channels_reward + * math.ceil(observation_shape[1] / 16) + * math.ceil(observation_shape[2] / 16) + ) + if downsample + else (reduced_channels_reward * observation_shape[1] * observation_shape[2]) + ) + + # observations_shape存放的时观察值的维度形状,第0维时观察的当前和历史维度,后面几维是观察值 + block_output_size_value = ( + ( + reduced_channels_value + * math.ceil(observation_shape[1] / 16) + * math.ceil(observation_shape[2] / 16) + ) + if downsample + else (reduced_channels_value * observation_shape[1] * observation_shape[2]) + ) + + block_output_size_policy = ( + ( + reduced_channels_policy + * math.ceil(observation_shape[1] / 16) + * math.ceil(observation_shape[2] / 16) + ) + if downsample + else (reduced_channels_policy * observation_shape[1] * observation_shape[2]) + ) + + # self.representation_network = torch.nn.DataParallel( + # RepresentationNetwork( + # observation_shape, + # stacked_observations, + # num_blocks, + # num_channels, + # downsample, + # ) + # ) + + self.dynamics_network = torch.nn.DataParallel( + DynamicsNetwork( + num_blocks, + num_channels + 1, + reduced_channels_reward, + fc_reward_layers, + self.full_support_size, + block_output_size_reward, + ) + ) + + self.prediction_network = torch.nn.DataParallel( + PredictionNetwork( + action_space_size, + num_blocks, + num_channels, + reduced_channels_value, + reduced_channels_policy, + fc_value_layers, + fc_policy_layers, + self.full_support_size, + block_output_size_value, + block_output_size_policy, + ) + ) + + def prediction(self, encoded_state): + # print("encoded_state shape is : " , encoded_state.shape) + policy, value = self.prediction_network(encoded_state) + return policy, value + + # def representation(self, observation): + # # print("observation shape is : ", observation.shape) + # encoded_state = self.representation_network(observation) + # + # # Scale encoded state between [0, 1] (See appendix paper Training) + # min_encoded_state = ( + # encoded_state.view( + # -1, + # encoded_state.shape[1], + # encoded_state.shape[2] * encoded_state.shape[3], + # ) + # .min(2, keepdim=True)[0] + # .unsqueeze(-1) + 
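# Sketch (standalone, not from this patch) of the categorical "support" convention
# referenced in the comments above: value/reward heads emit 2 * support_size + 1
# logits covering the integer range [-support_size, support_size], and the initial
# reward is the log of a one-hot distribution centred on zero
# (index support_size == full_support_size // 2).
import torch

support_size = 10
full_support_size = 2 * support_size + 1                        # 21 bins for [-10, 10]
initial_reward_logits = torch.log(
    torch.zeros(1, full_support_size)
    .scatter(1, torch.tensor([[full_support_size // 2]]), 1.0)  # put all mass on 0
)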
# ) + # max_encoded_state = ( + # encoded_state.view( + # -1, + # encoded_state.shape[1], + # encoded_state.shape[2] * encoded_state.shape[3], + # ) + # .max(2, keepdim=True)[0] + # .unsqueeze(-1) + # ) + # scale_encoded_state = max_encoded_state - min_encoded_state + # scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 + # encoded_state_normalized = ( + # encoded_state - min_encoded_state + # ) / scale_encoded_state + # return encoded_state_normalized + + def representation(self, encoded_state): + # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) + action_one_hot = ( + torch.ones( + ( + encoded_state.shape[0], + 1, + encoded_state.shape[2], + encoded_state.shape[3], + ) + ) + .to(encoded_state.device) + .float() + ) + # action_one_hot = ( + # action[:, :, None, None] * action_one_hot / self.action_space_size + # ) + x = torch.cat((encoded_state, action_one_hot), dim=1) + next_encoded_state, _ = self.dynamics_network(x) # 第二个参数是reward,在表示网络不需要它 + + # Scale encoded state between [0, 1] (See paper appendix Training) + min_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .min(2, keepdim=True)[0] + .unsqueeze(-1) + ) + max_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .max(2, keepdim=True)[0] + .unsqueeze(-1) + ) + scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state + scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 + next_encoded_state_normalized = ( + next_encoded_state - min_next_encoded_state + ) / scale_next_encoded_state + return next_encoded_state_normalized - return (value, reward, policy_logits, encoded_state) + def dynamics(self, encoded_state, action): + # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture) + action_one_hot = ( + torch.ones( + ( + encoded_state.shape[0], + 1, + encoded_state.shape[2], + encoded_state.shape[3], + ) + ) + .to(action.device) + .float() + ) + action_one_hot = ( + action[:, :, None, None] * action_one_hot / self.action_space_size + ) + x = torch.cat((encoded_state, action_one_hot), dim=1) + next_encoded_state, reward = self.dynamics_network(x) + + # Scale encoded state between [0, 1] (See paper appendix Training) + min_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .min(2, keepdim=True)[0] + .unsqueeze(-1) + ) + max_next_encoded_state = ( + next_encoded_state.view( + -1, + next_encoded_state.shape[1], + next_encoded_state.shape[2] * next_encoded_state.shape[3], + ) + .max(2, keepdim=True)[0] + .unsqueeze(-1) + ) + scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state + scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 + next_encoded_state_normalized = ( + next_encoded_state - min_next_encoded_state + ) / scale_next_encoded_state + return next_encoded_state_normalized, reward + + def initial_inference(self, observation): + encoded_state = self.representation(observation) + # action = torch.tensor([[0]]).to(observation.device) + # encoded_state = self.dynamics(observation, action) + policy_logits, value = self.prediction(encoded_state) + # reward equal to 0 for consistency + reward = torch.log( + ( + torch.zeros(1, self.full_support_size) + .scatter(1, 
torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # 将support_size位置设为1 + .repeat(len(observation), 1) # 根据observation的长度复制,保证reward的维度于observation的一致,即之前的observation也赋值 + .to(observation.device) + ) + ) + return ( + value, + reward, + policy_logits, + encoded_state, + ) def recurrent_inference(self, encoded_state, action): next_encoded_state, reward = self.dynamics(encoded_state, action) diff --git a/simplified_muzero.py b/simplified_muzero.py new file mode 100644 index 00000000..cd99153e --- /dev/null +++ b/simplified_muzero.py @@ -0,0 +1,108 @@ +from simplifiedMuZero.net2.models_2net import SimplifiedMuZeroNetwork +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = SimplifiedMuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. 
{options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/simplified_muzero2.py b/simplified_muzero2.py new file mode 100644 index 00000000..a136dd44 --- /dev/null +++ b/simplified_muzero2.py @@ -0,0 +1,108 @@ +from simplifiedMuZero.models2 import MuZeroNetwork_2net +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = MuZeroNetwork_2net + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. 
{games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/test/deap_test.py b/test/deap_test.py new file mode 100644 index 00000000..0ec02e8e --- /dev/null +++ b/test/deap_test.py @@ -0,0 +1,44 @@ +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np + +config = MuZeroConfig() +print(config.max_moves) + +from deap import base, creator, tools +import numpy as np +# 定义问题 +creator.create('FitnessMax', base.Fitness, weights=(-1.0,)) #优化目标:单变量,求最小值 +creator.create('Individual', list, fitness = creator.FitnessMax) #创建Individual类,继承list + +legal_actions = 9 + +toolbox = base.Toolbox() +toolbox.register("Indices", random.sample, range(legal_actions), legal_actions) +toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Indices) + +ind1 = toolbox.Individual() +print(ind1) + +toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +pop = toolbox.population(n=36) +print(len(pop)) + +def ea(game): + pass + +# game = Game(0) +# game.reset() +# +# for i in range(9): +# game.render() +# print(game.legal_actions()) +# observation, reward, done = game.step(np.random.choice(game.legal_actions())) +# +# if done: +# break +# +# game.render() From 885308edef158a45c55ae2695de9f06a80a63c2b Mon Sep 17 00:00:00 2001 From: chunchangshao Date: 
Tue, 22 Aug 2023 20:14:38 +0100 Subject: [PATCH 6/9] parameter optimization --- game_tournament.py | 252 ++++- games/simple_grid.py | 2 + games/tictactoe.py | 3 +- muzero_2net.py | 8 +- muzero_no_pv.py | 716 +++++++++++++ muzero_rhea.py | 719 +++++++++++++ muzero_uniform.py | 4 +- muzero_without_replay_buffer.py | 964 ++---------------- muzero_without_replay_buffer2.py | 108 -- simplifiedMuZero/net2/__init__.py | 0 simplifiedMuZero/{ => net2}/models2.py | 9 +- simplifiedMuZero/net2/replay_buffer_2net.py | 7 +- simplifiedMuZero/net2/self_play_2net.py | 6 +- simplifiedMuZero/net2/trainer_2net.py | 6 +- simplifiedMuZero/no_pv/trainer_no_pv.py | 301 ++++++ simplifiedMuZero/search_policy/RHEA.py | 83 +- simplifiedMuZero/search_policy/RHEA2.py | 192 ++++ .../search_policy/rhea_self_play.py | 227 +++++ simplified_muzero.py | 4 +- simplified_muzero2.py | 108 -- test/deap_test.py | 108 +- test/deap_test2.py | 119 +++ test/load_model.py | 12 + 23 files changed, 2792 insertions(+), 1166 deletions(-) create mode 100644 muzero_no_pv.py create mode 100644 muzero_rhea.py delete mode 100644 muzero_without_replay_buffer2.py create mode 100644 simplifiedMuZero/net2/__init__.py rename simplifiedMuZero/{ => net2}/models2.py (98%) create mode 100644 simplifiedMuZero/no_pv/trainer_no_pv.py create mode 100644 simplifiedMuZero/search_policy/RHEA2.py create mode 100644 simplifiedMuZero/search_policy/rhea_self_play.py delete mode 100644 simplified_muzero2.py create mode 100644 test/deap_test2.py create mode 100644 test/load_model.py diff --git a/game_tournament.py b/game_tournament.py index 918beac3..9e8499e5 100644 --- a/game_tournament.py +++ b/game_tournament.py @@ -6,8 +6,8 @@ from games.tictactoe import MuZeroConfig, Game import models +import simplifiedMuZero.net2.models2 as models2 from self_play import MCTS, GameHistory,SelfPlay -from simplifiedMuZero.search_policy.self_play_uniform_search import UniformSearch class GameTournament: def __init__(self, config:MuZeroConfig): @@ -107,6 +107,73 @@ def play_competition(self, model1, search_policy1, model2, search_policy2): # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 return self.game.env.have_winner(), is_model1 == (reward > 0) + def play_with_expert(self, model, search_policy, expert_first=True): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model.eval() + + is_model = not expert_first + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
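# Sketch (toy inputs, not from this patch) of the temperature-based action choice
# behind the SelfPlay.select_action(root, 0) call below: temperature 0 greedily
# picks the child with the highest MCTS visit count, while a positive temperature
# samples in proportion to visit_count ** (1 / temperature).
import numpy

def select_from_visit_counts(actions, visit_counts, temperature):
    visit_counts = numpy.array(visit_counts, dtype="float64")
    if temperature == 0:
        return actions[int(numpy.argmax(visit_counts))]
    distribution = visit_counts ** (1 / temperature)
    distribution /= distribution.sum()
    return actions[numpy.random.choice(len(actions), p=distribution)]

print(select_from_visit_counts([0, 1, 2], [5, 30, 10], temperature=0))  # -> 1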
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + + if is_model: + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + else: + action = self.game.expert_agent() + root = None + + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model = not is_model + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model == (reward > 0) + def close_game(self): self.game.close() @@ -124,7 +191,7 @@ def play_tournament(self, models, rollnum=1000): no_winner_num = 0 for _ in range(rollnum): - have_winner, is_model1 = game_tournament.play_competition(model1, MCTS, model2, MCTS) + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) if have_winner: if is_model1: @@ -134,30 +201,133 @@ def play_tournament(self, models, rollnum=1000): else: no_winner_num += 1 - # 交换顺序,再来一遍 + # # 交换顺序,再来一遍 + # for _ in range(rollnum): + # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) + # + # if have_winner: + # if is_model1: + # model2_win_num += 1 + # else: + # model1_win_num += 1 + # else: + # no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): - have_winner, is_model1 = game_tournament.play_competition(model2, MCTS, model1, MCTS) + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) if have_winner: if is_model1: - model2_win_num += 1 - else: model1_win_num += 1 + else: + model2_win_num += 1 else: no_winner_num += 1 - # print(is_model1) - print(models[i]["name"]," ,", models[j]["name"]," : ") + print(models[j]["name"]," ,", models[i]["name"]," : ") - print(models[i]["name"], " win : ", model1_win_num) - print(models[j]["name"], " win : ", model2_win_num) + print(models[j]["name"], " win : ", model1_win_num) + print(models[i]["name"], " win : ", model2_win_num) print("No Winner", no_winner_num) print("===================================") + def play_tournament_with_expert(self, models, rollnum=1000): + model_num = 
len(models) + + for i in range(model_num): + model = models[i]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + + print(models[i]["name"], " ,", "expert : ") + + print(models[i]["name"], " win : ", model_win_num) + print("expert win : ", expert_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + print("expert : ", " ,", models[i]["name"]) + + print("expert win : ", expert_win_num) + print(models[i]["name"], " win : ", model_win_num) + print("No Winner", no_winner_num) + print("===================================") -def load_model(model_cls, model_path): + +def load_model(model_cls, model_path, config): checkpoint = torch.load(model_path) model = model_cls(config) model.set_weights(checkpoint["weights"]) @@ -168,17 +338,32 @@ def load_model(model_cls, model_path): if __name__ == "__main__": config = MuZeroConfig() - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1) + # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--09-40-26\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) + + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path) + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path) + uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) without_rb_checkpoint_path = 
r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path) + without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + + muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + + + simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + game_tournament = GameTournament(config) @@ -187,35 +372,14 @@ def load_model(model_cls, model_path): {"name":"uniform", "model":uniform_model}, {"name":"muzero", "model":muzero_model}, {"name": "without_rb", "model": without_rb_model}, + {"name": "no policy value", "model": muzero_no_policy_model}, + {"name": "simplified_muzero", "model": without_rb_model}, ] - # rollnum = 1000 - # - # # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - # model1_win_num = 0 - # model2_win_num = 0 - # no_winner_num = 0 - # - # for i in range(rollnum): - # have_winner, is_model1 = game_tournament.play_competition(muzero_2net_model, MCTS, uniform_model, MCTS) - # - # if have_winner: - # if is_model1: - # model1_win_num += 1 - # else: - # model2_win_num += 1 - # else: - # no_winner_num += 1 - # - # # print(is_model1) - # - # print(model1_win_num) - # print(model2_win_num) - # print(no_winner_num) - - game_tournament.play_tournament(models, rollnum=100) - game_tournament.close_game() + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament(models, rollnum=10) + game_tournament.play_tournament_with_expert(models, rollnum=100) + game_tournament.close_game() - # print(checkpoint) diff --git a/games/simple_grid.py b/games/simple_grid.py index f26ae429..d163d7de 100644 --- a/games/simple_grid.py +++ b/games/simple_grid.py @@ -23,6 +23,8 @@ def __init__(self): self.players = list(range(1)) # List of players. You should only edit the length self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + self.action_replace = True + # Evaluate self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class diff --git a/games/tictactoe.py b/games/tictactoe.py index c2529d5d..787986fb 100644 --- a/games/tictactoe.py +++ b/games/tictactoe.py @@ -27,7 +27,8 @@ def __init__(self): self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. 
None, "random" or "expert" if implemented in the Game class - + # 动作是否能重复 + self.action_replace = False ### Self-Play self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer diff --git a/muzero_2net.py b/muzero_2net.py index 39438acd..642602da 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -16,7 +16,9 @@ sys.path.append("") import diagnose_model -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net import simplifiedMuZero.net2.replay_buffer_2net as replay_buffer import simplifiedMuZero.net2.self_play_2net as self_play import shared_storage @@ -69,6 +71,7 @@ def __init__(self, game_name, config=None, split_resources_in=1): # 重命名路径,以便区分不同的模型 self.config.results_path /= "muzero_2net" + self.config.training_steps = 100000 # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) @@ -491,7 +494,8 @@ def __init__(self): pass def get_initial_weights(self, config): - model = models.SimplifiedMuZeroNetwork(config) + # model = models.SimplifiedMuZeroNetwork(config) + model = MuZeroNetwork_2net(config) weigths = model.get_weights() summary = str(model).replace("\n", " \n\n") return weigths, summary diff --git a/muzero_no_pv.py b/muzero_no_pv.py new file mode 100644 index 00000000..e94789ed --- /dev/null +++ b/muzero_no_pv.py @@ -0,0 +1,716 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import self_play +import shared_storage +import simplifiedMuZero.no_pv.trainer_no_pv as trainer + + +class MuZero: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
+ ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_rhea.py b/muzero_rhea.py new file mode 100644 index 00000000..07ceee18 --- /dev/null +++ b/muzero_rhea.py @@ -0,0 +1,719 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import 
simplifiedMuZero.search_policy.rhea_self_play as self_play +import shared_storage +import trainer + + +class MuZero_Rhea: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_Rhea("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= self.__class__.__name__ + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_Rhea): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_Rhea(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_Rhea(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_Rhea(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_Rhea(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_uniform.py b/muzero_uniform.py index 24a9e09b..53d4a0b9 100644 --- a/muzero_uniform.py +++ b/muzero_uniform.py @@ -16,7 +16,8 @@ import diagnose_model import models import replay_buffer -import simplifiedMuZero.search_policy.self_play_uniform_search as self_play +import self_play +# import simplifiedMuZero.search_policy.self_play_uniform_search as self_play import shared_storage import trainer @@ -67,6 +68,7 @@ def __init__(self, 
game_name, config=None, split_resources_in=1): # 重命名路径,以便区分不同的模型 self.config.results_path /= "muzero_uniform" + self.config.temperature_threshold = 0 # Fix random generator seed numpy.random.seed(self.config.seed) diff --git a/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py index 2eba36a0..4b87fc7b 100644 --- a/muzero_without_replay_buffer.py +++ b/muzero_without_replay_buffer.py @@ -1,870 +1,108 @@ -from self_play import MCTS, GameHistory -from games.simple_grid import MuZeroConfig, Game -# from games.tictactoe import MuZeroConfig, Game import models +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search -import numpy -import torch -from torch.utils.tensorboard import SummaryWriter -import pickle - -import math +import json +import sys +import pathlib import time -import copy - -class GamePlay: - """ - Class which run in a dedicated thread to play games and save them to the replay-buffer. - """ - - def __init__(self, model, initial_checkpoint, Game, config, seed): - self.config = config - self.game = Game(seed) - - # Fix random generator seed - numpy.random.seed(seed) - torch.manual_seed(seed) - - # Initialize the network - # self.model = models.MuZeroNetwork(self.config) - # self.model.set_weights(initial_checkpoint["weights"]) - self.model = model - self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) - self.model.eval() - self.trained_steps = initial_checkpoint["training_step"] - self.terminate = False - - #play game 运行 - # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 - # 运行步骤: - # 1. 创建GameHistory用来存储数据 - # 2. 检查游戏是否结束或者到底最大移动次数 - # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) - # 4. 运行MCTS搜索下一步的action - # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done - # 6. 持续运行2-5步直到结束 - # 7. 返回GameHistory - def play_game( - self, temperature, temperature_threshold, render, opponent, muzero_player - ): - """ - Play one game with actions based on the Monte Carlo tree search at each moves. - """ - game_history = GameHistory() - observation = self.game.reset() - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - game_id = None - - if render: - self.game.render() - - game_id = self.game.to_play() - - with torch.no_grad(): - while ( - not done and len(game_history.action_history) <= self.config.max_moves - ): # 游戏没有结束且运行步数小于最大移动步长 - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
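# --- Editorial aside (illustration only, not part of this patch) -------------------
# The selection step just below turns MCTS visit counts into a sampling distribution
# via counts ** (1 / temperature); the select_action helper further down in this
# removed class implements exactly that. A self-contained sketch, using a
# hypothetical helper (visit_softmax is not project code), of how the temperature
# sharpens or flattens the distribution:
import numpy

def visit_softmax(visit_counts, temperature):
    counts = numpy.array(visit_counts, dtype="float64")
    if temperature == 0:
        probs = numpy.zeros_like(counts)
        probs[counts.argmax()] = 1.0  # t = 0: purely greedy on the visit counts
        return probs
    scaled = counts ** (1 / temperature)
    return scaled / scaled.sum()      # t = 1: proportional; t < 1: sharper

print(visit_softmax([10, 30, 60], 1.0))   # ~[0.10, 0.30, 0.60]
print(visit_softmax([10, 30, 60], 0.25))  # ~[0.00, 0.06, 0.94]
print(visit_softmax([10, 30, 60], 0))     # [0., 0., 1.]
# ------------------------------------------------------------------------------------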
- stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 - # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 - - # 一下的if-else部分主要是为了选择一个动作 - # Choose the action - if opponent == "self" or muzero_player == self.game.to_play(): - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = self.select_action( - root, - temperature - if not temperature_threshold - or len(game_history.action_history) < temperature_threshold - else 0, - ) # 根据temperature选择动作 - - if render: - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print( - f"Root value for player {self.game.to_play()}: {root.value():.2f}" - ) - else: - action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 - opponent, stacked_observations - ) - - observation, reward, done = self.game.step(action) # 运行游戏 - - if render: - print(f"Played action: {self.game.action_to_string(action)}") - self.game.render() - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - return game_id, game_history - - def close_game(self): - self.game.close() - - def select_opponent_action(self, opponent, stacked_observations): - """ - Select opponent action for evaluating MuZero level. - """ - if opponent == "human": - root, mcts_info = MCTS(self.config).run( - self.model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), - True, - ) - print(f'Tree depth: {mcts_info["max_tree_depth"]}') - print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") - print( - f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" - ) - return self.game.human_to_action(), root - elif opponent == "expert": - return self.game.expert_agent(), None - elif opponent == "random": - assert ( - self.game.legal_actions() - ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." - assert set(self.game.legal_actions()).issubset( - set(self.config.action_space) - ), "Legal actions should be a subset of the action space." - - return numpy.random.choice(self.game.legal_actions()), None - else: - raise NotImplementedError( - 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' - ) - - # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 - # 公式为 c^(1/t)。可以看到: - # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 - # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 - # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 - @staticmethod # 静态方法修饰符,类似于static关键字 - def select_action(node, temperature): - """ - Select action according to the visit count distribution and the temperature. - The temperature is changed dynamically with the visit_softmax_temperature function - in the config. 
- """ - visit_counts = numpy.array( - [child.visit_count for child in node.children.values()], dtype="int32" - ) - actions = [action for action in node.children.keys()] - if temperature == 0: - action = actions[numpy.argmax(visit_counts)] - elif temperature == float("inf"): - action = numpy.random.choice(actions) - else: - # See paper appendix Data Generation - visit_count_distribution = visit_counts ** (1 / temperature) - visit_count_distribution = visit_count_distribution / sum( - visit_count_distribution - ) - action = numpy.random.choice(actions, p=visit_count_distribution) - - return action - -class PlayBuffer: - """ - Class which run in a dedicated thread to store played games and generate batch. - """ - - def __init__(self, initial_checkpoint, initial_buffer, config): - self.config = config - self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} - self.num_played_games = initial_checkpoint["num_played_games"] - self.num_played_steps = initial_checkpoint["num_played_steps"] - self.total_samples = sum( - [len(game_history.root_values) for game_history in self.buffer.values()] - ) - if self.total_samples != 0: - print( - f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" - ) - - # Fix random generator seed - numpy.random.seed(self.config.seed) - - def save_game(self, game_history): - self.buffer[self.num_played_games] = game_history - self.num_played_games += 1 - self.num_played_steps += len(game_history.root_values) - self.total_samples += len(game_history.root_values) - - if self.config.replay_buffer_size < len(self.buffer): - del_id = self.num_played_games - len(self.buffer) - self.total_samples -= len(self.buffer[del_id].root_values) - del self.buffer[del_id] +import nevergrad - def get_buffer(self): - return self.buffer - - def get_batch(self): - ( - index_batch, - observation_batch, - action_batch, - reward_batch, - value_batch, - policy_batch, - gradient_scale_batch, - ) = ([], [], [], [], [], [], []) - weight_batch = None - - for game_id, game_history, game_prob in self.sample_n_games( - self.config.batch_size - ): - game_pos, pos_prob = self.sample_position(game_history) - - values, rewards, policies, actions = self.make_target( - game_history, game_pos - ) - - index_batch.append([game_id, game_pos]) - observation_batch.append( - game_history.get_stacked_observations( - game_pos, - self.config.stacked_observations, - len(self.config.action_space), - ) - ) - action_batch.append(actions) - value_batch.append(values) - reward_batch.append(rewards) - policy_batch.append(policies) - gradient_scale_batch.append( - [ - min( - self.config.num_unroll_steps, - len(game_history.action_history) - game_pos, - ) - ] - * len(actions) - ) - - # observation_batch: batch, channels, height, width - # action_batch: batch, num_unroll_steps+1 - # value_batch: batch, num_unroll_steps+1 - # reward_batch: batch, num_unroll_steps+1 - # policy_batch: batch, num_unroll_steps+1, len(action_space) - # weight_batch: batch - # gradient_scale_batch: batch, num_unroll_steps+1 - return ( - index_batch, - ( - observation_batch, - action_batch, - value_batch, - reward_batch, - policy_batch, - weight_batch, - gradient_scale_batch, - ), - ) - - def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 - """ - Sample game from buffer either uniformly or according to some priority. - See paper appendix Training. 
- """ - game_prob = None - - game_index = numpy.random.choice(len(self.buffer)) - game_id = self.num_played_games - len(self.buffer) + game_index - - return game_id, self.buffer[game_id], game_prob - - def sample_n_games(self, n_games): - selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) - game_prob_dict = {} - ret = [ - (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) - for game_id in selected_games +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = models.MuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" ] - return ret - - def sample_position(self, game_history): - """ - Sample position from game either uniformly or according to some priority. - See paper appendix Training. - """ - position_prob = None - - position_index = numpy.random.choice(len(game_history.root_values)) - - return position_index, position_prob - - def update_game_history(self, game_id, game_history): - # The element could have been removed since its selection and update - # if next(iter(self.buffer)) <= game_id: - # self.buffer[game_id] = game_history - - self.buffer[game_id] = game_history - - def compute_target_value(self, game_history, index): - # The value target is the discounted root value of the search tree td_steps into the - # future, plus the discounted sum of all rewards until then. - bootstrap_index = index + self.config.td_steps - if bootstrap_index < len(game_history.root_values): - root_values = ( - game_history.root_values - if game_history.reanalysed_predicted_root_values is None - else game_history.reanalysed_predicted_root_values - ) - last_step_value = ( - root_values[bootstrap_index] - if game_history.to_play_history[bootstrap_index] - == game_history.to_play_history[index] - else -root_values[bootstrap_index] - ) - - value = last_step_value * self.config.discount**self.config.td_steps - else: - value = 0 - - for i, reward in enumerate( - game_history.reward_history[index + 1 : bootstrap_index + 1] - ): - # The value is oriented from the perspective of the current player - value += ( - reward - if game_history.to_play_history[index] - == game_history.to_play_history[index + i] - else -reward - ) * self.config.discount**i - - return value - - def make_target(self, game_history, state_index): - """ - Generate targets for every unroll steps. 
- """ - target_values, target_rewards, target_policies, actions = [], [], [], [] - for current_index in range( - state_index, state_index + self.config.num_unroll_steps + 1 - ): - value = self.compute_target_value(game_history, current_index) - - if current_index < len(game_history.root_values): - target_values.append(value) - target_rewards.append(game_history.reward_history[current_index]) - target_policies.append(game_history.child_visits[current_index]) - actions.append(game_history.action_history[current_index]) - elif current_index == len(game_history.root_values): - target_values.append(0) - target_rewards.append(game_history.reward_history[current_index]) - # Uniform policy - target_policies.append( - [ - 1 / len(game_history.child_visits[0]) - for _ in range(len(game_history.child_visits[0])) - ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 ) - actions.append(game_history.action_history[current_index]) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) else: - # States past the end of games are treated as absorbing states - target_values.append(0) - target_rewards.append(0) - # Uniform policy - target_policies.append( - [ - 1 / len(game_history.child_visits[0]) - for _ in range(len(game_history.child_visits[0])) - ] - ) - actions.append(numpy.random.choice(self.config.action_space)) - - return target_values, target_rewards, target_policies, actions - -class Trainer: 
- """ - Class which run in a dedicated thread to train a neural network and save it - in the shared storage. - """ - - def __init__(self, initial_checkpoint, config): - self.config = config - - # Fix random generator seed - numpy.random.seed(self.config.seed) - torch.manual_seed(self.config.seed) - - # Initialize the network - self.model = models.MuZeroNetwork(self.config) - # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) - self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) - self.model.train() - - self.training_step = initial_checkpoint["training_step"] - - if "cuda" not in str(next(self.model.parameters()).device): - print("You are not training on GPU.\n") - - # Initialize the optimizer - if self.config.optimizer == "SGD": - self.optimizer = torch.optim.SGD( - self.model.parameters(), - lr=self.config.lr_init, - momentum=self.config.momentum, - weight_decay=self.config.weight_decay, - ) - elif self.config.optimizer == "Adam": - self.optimizer = torch.optim.Adam( - self.model.parameters(), - lr=self.config.lr_init, - weight_decay=self.config.weight_decay, - ) - else: - raise NotImplementedError( - f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." - ) - - # if initial_checkpoint["optimizer_state"] is not None: - # print("Loading optimizer...\n") - # self.optimizer.load_state_dict( - # copy.deepcopy(initial_checkpoint["optimizer_state"]) - # ) - - # # update weights 与 continuous update weights 的区别 - # # 1. update weights 是实际计算更新network的权重 - # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 - # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 - # next_batch = play_buffer.get_batch() - # # Training loop - # while self.training_step < self.config.training_steps and not terminate: - # index_batch, batch = next_batch - # next_batch = play_buffer.get_batch() - # self.update_lr() - # ( - # priorities, - # total_loss, - # value_loss, - # reward_loss, - # policy_loss, - # ) = self.update_weights(batch) - - def update_weights(self, batch): - """ - Perform one training step. 
- """ - - ( - observation_batch, - action_batch, - target_value, - target_reward, - target_policy, - weight_batch, - gradient_scale_batch, - ) = batch - - # Keep values as scalars for calculating the priorities for the prioritized replay - target_value_scalar = numpy.array(target_value, dtype="float32") - priorities = numpy.zeros_like(target_value_scalar) - - device = next(self.model.parameters()).device - observation_batch = ( - torch.tensor(numpy.array(observation_batch)).float().to(device) - ) - action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) - target_value = torch.tensor(target_value).float().to(device) - target_reward = torch.tensor(target_reward).float().to(device) - target_policy = torch.tensor(target_policy).float().to(device) - gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) - # observation_batch: batch, channels, height, width - # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) - # target_value: batch, num_unroll_steps+1 - # target_reward: batch, num_unroll_steps+1 - # target_policy: batch, num_unroll_steps+1, len(action_space) - # gradient_scale_batch: batch, num_unroll_steps+1 - - target_value = models.scalar_to_support(target_value, self.config.support_size) - target_reward = models.scalar_to_support( - target_reward, self.config.support_size - ) - # target_value: batch, num_unroll_steps+1, 2*support_size+1 - # target_reward: batch, num_unroll_steps+1, 2*support_size+1 - - ## Generate predictions - value, reward, policy_logits, hidden_state = self.model.initial_inference( - observation_batch - ) - predictions = [(value, reward, policy_logits)] - for i in range(1, action_batch.shape[1]): - value, reward, policy_logits, hidden_state = self.model.recurrent_inference( - hidden_state, action_batch[:, i] - ) - # Scale the gradient at the start of the dynamics function (See paper appendix Training) - hidden_state.register_hook(lambda grad: grad * 0.5) - predictions.append((value, reward, policy_logits)) - # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) - - ## Compute losses - value_loss, reward_loss, policy_loss = (0, 0, 0) - value, reward, policy_logits = predictions[0] - # Ignore reward loss for the first batch step - current_value_loss, _, current_policy_loss = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, 0], - target_reward[:, 0], - target_policy[:, 0], - ) - value_loss += current_value_loss - policy_loss += current_policy_loss - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, 0] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) - ** self.config.PER_alpha - ) - - for i in range(1, len(predictions)): - value, reward, policy_logits = predictions[i] - ( - current_value_loss, - current_reward_loss, - current_policy_loss, - ) = self.loss_function( - value.squeeze(-1), - reward.squeeze(-1), - policy_logits, - target_value[:, i], - target_reward[:, i], - target_policy[:, i], - ) - - # Scale gradient by the number of unroll steps (See paper appendix Training) - current_value_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - current_reward_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) - current_policy_loss.register_hook( - lambda grad: grad / gradient_scale_batch[:, i] - ) 
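# --- Editorial aside (illustration only, not part of this patch) -------------------
# The register_hook calls above implement the gradient scaling referenced in the
# "paper appendix Training" comments: the hook rescales the gradient flowing into a
# tensor during backward without changing the forward value. A minimal PyTorch-only
# check of that mechanism (toy tensors, nothing from this repository):
import torch

x = torch.ones(3, requires_grad=True)
x.register_hook(lambda grad: grad * 0.5)  # same trick as hidden_state / the loss terms
y = (2 * x).sum()
y.backward()
print(x.grad)  # tensor([1., 1., 1.]) -- halved from the unscaled [2., 2., 2.]
# ------------------------------------------------------------------------------------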
- - value_loss += current_value_loss - reward_loss += current_reward_loss - policy_loss += current_policy_loss - - # Compute priorities for the prioritized replay (See paper appendix Training) - pred_value_scalar = ( - models.support_to_scalar(value, self.config.support_size) - .detach() - .cpu() - .numpy() - .squeeze() - ) - priorities[:, i] = ( - numpy.abs(pred_value_scalar - target_value_scalar[:, i]) - ** self.config.PER_alpha - ) - - # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) - loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - - # Mean over batch dimension (pseudocode do a sum) - loss = loss.mean() - - # Optimize - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - self.training_step += 1 - - return ( - priorities, - # For log purpose - loss.item(), - value_loss.mean().item(), - reward_loss.mean().item(), - policy_loss.mean().item(), - ) - - def update_lr(self): - """ - Update learning rate - """ - lr = self.config.lr_init * self.config.lr_decay_rate ** ( - self.training_step / self.config.lr_decay_steps - ) - for param_group in self.optimizer.param_groups: - param_group["lr"] = lr - - @staticmethod - def loss_function( - value, - reward, - policy_logits, - target_value, - target_reward, - target_policy, - ): - # Cross-entropy seems to have a better convergence than MSE - value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) - reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) - policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) - - return value_loss, reward_loss, policy_loss - - -def logging_loop(config, checkpoint, writer, training_steps): - # writer = SummaryWriter(config.results_path) - - # print( - # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" - # ) - - # Save hyperparameters to TensorBoard - hp_table = [ - f"| {key} | {value} |" for key, value in config.__dict__.items() - ] - writer.add_text( - "Hyperparameters", - "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), - ) - # # Save model representation - # writer.add_text( - # "Model summary", - # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 - # ) - # Loop for updating the training performance - counter = training_steps - - try: - if True: - # while checkpoint["training_step"] < config.training_steps: - writer.add_scalar( - "1.Total_reward/1.Total_reward", - checkpoint["total_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/2.Mean_value", - checkpoint["mean_value"], - counter, - ) - writer.add_scalar( - "1.Total_reward/3.Episode_length", - checkpoint["episode_length"], - counter, - ) - writer.add_scalar( - "1.Total_reward/4.MuZero_reward", - checkpoint["muzero_reward"], - counter, - ) - writer.add_scalar( - "1.Total_reward/5.Opponent_reward", - checkpoint["opponent_reward"], - counter, - ) - writer.add_scalar( - "2.Workers/1.Self_played_games", - checkpoint["num_played_games"], - counter, - ) - writer.add_scalar( - "2.Workers/2.Training_steps", checkpoint["training_step"], counter - ) - writer.add_scalar( - "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter - ) - writer.add_scalar( - "2.Workers/4.Reanalysed_games", - checkpoint["num_reanalysed_games"], - counter, - ) - writer.add_scalar( - "2.Workers/5.Training_steps_per_self_played_step_ratio", - checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), - 
counter, - ) - writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) - writer.add_scalar( - "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter - ) - writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) - writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) - writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) - print( - f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. Loss: {checkpoint["total_loss"]:.2f}', - end="\r", - ) - counter += 1 - # time.sleep(0.5) - except KeyboardInterrupt: - pass - - # if config.save_model: - # # Persist replay buffer to disk - # path = config.results_path / "replay_buffer.pkl" - # print(f"\n\nPersisting replay buffer games to disk at {path}") - # pickle.dump( - # { - # "buffer": buffer, - # "num_played_games": checkpoint["num_played_games"], - # "num_played_steps": checkpoint["num_played_steps"], - # "num_reanalysed_games": checkpoint["num_reanalysed_games"], - # }, - # open(path, "wb"), - # ) - -def update_gameplay_checkpoint(config, checkpoint, game_history): - checkpoint["episode_length"] = len(game_history.action_history) - 1 - checkpoint["total_reward"] = sum(game_history.reward_history) - checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) - - if 1 < len(config.players): - checkpoint["muzero_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - == config.muzero_player - ) - checkpoint["opponent_reward"] = sum( - reward - for i, reward in enumerate(game_history.reward_history) - if game_history.to_play_history[i - 1] - != config.muzero_player - ) - -def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 - if not path: - path = config.results_path / "model.checkpoint" - - torch.save(checkpoint, path) - -def train(log_in_tensorboard=True): - config = MuZeroConfig() - config.results_path /= "muzero_without_rb" - - if log_in_tensorboard or config.save_model: - config.results_path.mkdir(parents=True, exist_ok=True) - - checkpoint = { - "weights": None, - "optimizer_state": None, - "total_reward": 0, - "muzero_reward": 0, - "opponent_reward": 0, - "episode_length": 0, - "mean_value": 0, - "training_step": 0, - "lr": 0, - "total_loss": 0, - "value_loss": 0, - "reward_loss": 0, - "policy_loss": 0, - "num_played_games": 0, - "num_played_steps": 0, - "num_reanalysed_games": 0, - "terminate": False, - } - - trainer = Trainer(checkpoint, config) - selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) - buffer = {} - play_buffer = PlayBuffer(checkpoint, buffer, config) - - step = 1 # 间隔,即每次模拟后训练多少次 - max_steps = int(config.training_steps/step) - - writer = SummaryWriter(config.results_path) - - for episode in range(max_steps): - game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) - - # print(game_id) - # print(game_history.action_history) - # print(game_history.reward_history) - # print(game_history.to_play_history) - # # print(game_history.observation_history) - # print("child visits", game_history.child_visits) - # print(game_history.root_values) # root value指的是root节点的UCB值 - - play_buffer.update_game_history(game_id, game_history) - update_gameplay_checkpoint(config, checkpoint, game_history) - - for i in 
range(step): - index_batch, batch = play_buffer.get_batch() - # print(batch[1]) - trainer.update_lr() - ( - priorities, - total_loss, - value_loss, - reward_loss, - policy_loss, - ) = trainer.update_weights(batch) - - - training_step = episode * step + i - if training_step % config.checkpoint_interval == 0: - checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) - checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) - - if config.save_model: - save_checkpoint(config, checkpoint) - checkpoint["training_step"] = training_step - checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] - checkpoint["total_loss"] = total_loss - checkpoint["value_loss"] = value_loss - checkpoint["reward_loss"] = reward_loss - checkpoint["policy_loss"] = policy_loss - - # print(training_step) - # if training_step % 500 == 0: - # if training_step % config.checkpoint_interval == 0: - # # print(training_step) - # logging_loop(config, checkpoint, writer) - - logging_loop(config, checkpoint, writer, training_step) - - - writer.close() - - selfplay.close_game() - -if __name__ == "__main__": - start_time = time.time() - train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) \ No newline at end of file + break + print("\nDone") \ No newline at end of file diff --git a/muzero_without_replay_buffer2.py b/muzero_without_replay_buffer2.py deleted file mode 100644 index 4b87fc7b..00000000 --- a/muzero_without_replay_buffer2.py +++ /dev/null @@ -1,108 +0,0 @@ -import models -from muzero_general import MuZeroGeneral -from muzero import load_model_menu, hyperparameter_search - -import json -import sys -import pathlib -import time -import nevergrad - -if __name__ == "__main__": - # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") - # start_time = time.time() - # muzero.train() - # end_time = time.time() - # print("耗时: {:.2f}秒".format(end_time - start_time)) - model_cls = models.MuZeroNetwork - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZeroGeneral(game_name, model_cls=model_cls) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 - ) - muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) - else: - break - print("\nDone") \ No newline at end of file diff --git a/simplifiedMuZero/net2/__init__.py b/simplifiedMuZero/net2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/models2.py b/simplifiedMuZero/net2/models2.py similarity index 98% rename from simplifiedMuZero/models2.py rename to simplifiedMuZero/net2/models2.py index fd6aa6ee..c36e8095 100644 --- a/simplifiedMuZero/models2.py +++ b/simplifiedMuZero/net2/models2.py @@ -7,7 +7,6 @@ class MuZeroNetwork_2net: def __new__(cls, config): - print("MuZeroNetwork_2net") if config.network == "fullyconnected": return MuZeroFullyConnectedNetwork_2net( config.observation_shape, @@ -57,6 +56,7 @@ def __init__( support_size, ): super().__init__() + print(self.__class__.__name__) self.action_space_size = action_space_size self.full_support_size = 2 * support_size + 1 # support_size 表示的应该是一个选择的范围【-support_size, support_size】.最后+1是因为range最后不包含最后的数 @@ -100,6 +100,7 @@ def __init__( mlp(encoding_size, fc_value_layers, self.full_support_size) #最后的输出为full_support_size,因为范围是[-support_size, support_size] ) + def prediction(self, encoded_state): policy_logits = self.prediction_policy_network(encoded_state) value = self.prediction_value_network(encoded_state) @@ -128,10 +129,11 @@ def representation(self, observation): return self.encoded_stated_normalized(encoded_state) + # dynamic同representation的唯一不同就是前者需要将encoded_state和action合并在一起作为输入,而representation不需要绑定action def dynamics(self, encoded_state, action): action_one_hot = (torch.zeros((action.shape[0], self.action_space_size)).to(action.device).float()) - action_one_hot.scatter(1, action.long(), 1.0) + action_one_hot.scatter_(1, action.long(), 1.0) x = torch.cat((encoded_state, action_one_hot), dim=1) next_encoded_state = self.dynamics_encoded_state_network(x) @@ -185,9 +187,6 @@ def __init__( downsample, ): super().__init__() - print("observation shape is ", observation_shape) - print("num channels is ", num_channels) - num_channels 
= observation_shape[1] self.action_space_size = action_space_size self.full_support_size = 2 * support_size + 1 diff --git a/simplifiedMuZero/net2/replay_buffer_2net.py b/simplifiedMuZero/net2/replay_buffer_2net.py index 55522b86..646611c1 100644 --- a/simplifiedMuZero/net2/replay_buffer_2net.py +++ b/simplifiedMuZero/net2/replay_buffer_2net.py @@ -5,7 +5,9 @@ import ray import torch -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net @ray.remote @@ -318,7 +320,8 @@ def __init__(self, initial_checkpoint, config): torch.manual_seed(self.config.seed) # Initialize the network - self.model = models.SimplifiedMuZeroNetwork(self.config) + # self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model = MuZeroNetwork_2net(self.config) self.model.set_weights(initial_checkpoint["weights"]) self.model.to(torch.device("cuda" if self.config.reanalyse_on_gpu else "cpu")) self.model.eval() diff --git a/simplifiedMuZero/net2/self_play_2net.py b/simplifiedMuZero/net2/self_play_2net.py index a0a208a8..5ca7bfbd 100644 --- a/simplifiedMuZero/net2/self_play_2net.py +++ b/simplifiedMuZero/net2/self_play_2net.py @@ -5,7 +5,9 @@ import ray import torch -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net @ray.remote @@ -23,7 +25,7 @@ def __init__(self, initial_checkpoint, Game, config, seed): torch.manual_seed(seed) # Initialize the network - self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model = MuZeroNetwork_2net(self.config) # self.model = models.MuZeroNetwork(self.config) self.model.set_weights(initial_checkpoint["weights"]) self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) diff --git a/simplifiedMuZero/net2/trainer_2net.py b/simplifiedMuZero/net2/trainer_2net.py index 567b8f9a..d11612bd 100644 --- a/simplifiedMuZero/net2/trainer_2net.py +++ b/simplifiedMuZero/net2/trainer_2net.py @@ -5,7 +5,9 @@ import ray import torch -import simplifiedMuZero.net2.models_2net as models +# import simplifiedMuZero.net2.models_2net as models +import models +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net @ray.remote @@ -23,7 +25,7 @@ def __init__(self, initial_checkpoint, config): torch.manual_seed(self.config.seed) # Initialize the network - self.model = models.SimplifiedMuZeroNetwork(self.config) + self.model = MuZeroNetwork_2net(self.config) self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) self.model.train() diff --git a/simplifiedMuZero/no_pv/trainer_no_pv.py b/simplifiedMuZero/no_pv/trainer_no_pv.py new file mode 100644 index 00000000..e4a6080c --- /dev/null +++ b/simplifiedMuZero/no_pv/trainer_no_pv.py @@ -0,0 +1,301 @@ +import copy +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. 
+ """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + # loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: # 更新optimizer的lr + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/search_policy/RHEA.py b/simplifiedMuZero/search_policy/RHEA.py index d23c611b..fe070c8b 100644 --- a/simplifiedMuZero/search_policy/RHEA.py +++ b/simplifiedMuZero/search_policy/RHEA.py @@ -1,12 +1,75 @@ +import copy +import numpy as np +from functools import partial + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +class RHEA: + def __init__(self): + self.game = None + self.play_id = 0 + self.toolbox = base.Toolbox() + self.register("mate", tools.cxTwoPoint) + self.register("mutate", tools.mutFlipBit, indpb=0.05) + self.register("select", tools.selStochasticUniversalSampling) + + def game_evaluate(self, actions, game_stat=None, play_id=None): + game_stat = copy.deepcopy(game_stat) + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 
因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + def rhea(self, game_state:AbstractGame, config, play_id): + actions = game_state.legal_actions() + pop = self.population(actions. config.max_moves) + self.toolbox.register("evaluate", self.game_evaluate, game=game_state, play_id=play_id) + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + + results = tools.selBest(pop, k=1) + + # 返回第一个动作和评分 + return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + + -class RHEAIndividual: - def __init__(self, L:int, discount_factor:double, forword_model, state, play_id:int, - seed, heuristic): - self.state = state - self.L = L - self.discount_factor = discount_factor - self.forword_model = forword_model - self.play_id = play_id - self.seed = seed - self.heuristic = heuristic \ No newline at end of file diff --git a/simplifiedMuZero/search_policy/RHEA2.py b/simplifiedMuZero/search_policy/RHEA2.py new file mode 100644 index 00000000..73d30799 --- /dev/null +++ b/simplifiedMuZero/search_policy/RHEA2.py @@ -0,0 +1,192 @@ +import copy +import numpy as np +from functools import partial +import torch + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame +from self_play import Node +import models + +from games.tictactoe import MuZeroConfig, Game + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + + +def evaluate(actions, model, observation, config): + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + + for action in actions: + value, reward, policy_logits, hidden_state = model.recurrent_inference( + hidden_state, + torch.tensor([[action]]).to(observation.device), + ) + + reward = models.support_to_scalar(reward, config.support_size).item() + return reward, + +class RHEA: + def __init__(self, config, game): + self.game = game + self.config = config + self.play_id = -1 + self.toolbox = base.Toolbox() + self.toolbox.register("mate", tools.cxTwoPoint) + self.toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) + self.toolbox.register("select", tools.selStochasticUniversalSampling) + + # def game_evaluate(self, actions, game_stat=None, play_id=None): + # game_stat = copy.deepcopy(game_stat) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if play_id == player else -reward + # # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + # reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + # return reward, + # + # def action_evaluate(self, actions): + # game_stat = copy.deepcopy(self.game) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if self.play_id == player else -reward + # + # return reward, actions[:(i+1)] + 
# + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else min(len(actions), max_moves) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + # def rhea(self, game_state:AbstractGame): + # self.game = game_state + # self.play_id = game_state.to_play() + # actions = game_state.legal_actions() + # self.toolbox.register("evaluate", evaluate, ) + # pop = self.population(actions. self.config.max_moves) + # + # pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + # + # results = tools.selBest(pop, k=1) + # + # return self.action_evaluate(results[0]) + + + + # # 返回第一个动作和评分 + # return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + def run(self, + model, + observation, + legal_actions, + to_play, + action_replace, + override_root_with=None, + ): + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) + + # 检查可用的动作空间,如果小于等于1,则直接返回。因为进化算法无法杂交,会报错 + if len(legal_actions) <=1: + return legal_actions + else: + # self.toolbox.register("evaluate", evaluate, model=model, observation=observation, config=self.config) + self.toolbox.register("evaluate", self.evaluate) + pop = self.population(legal_actions, self.config.max_moves, self.config.num_simulations, replace=action_replace) + + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + results = tools.selBest(pop, k=1) + + return results[0] + +if __name__=="__main__": + game = Game() + config = MuZeroConfig() + game.reset() + done = False + + # rhea = RHEA(config, game) + # pop = rhea.population(game.legal_actions(), 9, config.num_simulations, config.action_replace) + # + # print(pop) + # rhea.toolbox.register("evaluate", rhea.evaluate) + # pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0, ngen=9, verbose=False) + # + # results = tools.selBest(pop, k=1) + # print(results) + + legal_actions = game.legal_actions() + while not done and len(legal_actions) >1: + legal_actions = game.legal_actions() + rhea = RHEA(config, game) + rhea.play_id = game.to_play() + + pop = rhea.population(legal_actions, config.max_moves, config.num_simulations, config.action_replace) + + rhea.toolbox.register("evaluate", rhea.evaluate) + + pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + print(pop) + results = tools.selBest(pop, k=1) + print(results) + action = results[0][0] + observation, reward, done = game.step(action) + # print(observation) + + + + + + + + + + + + diff --git a/simplifiedMuZero/search_policy/rhea_self_play.py b/simplifiedMuZero/search_policy/rhea_self_play.py new file mode 100644 index 00000000..ca49d875 --- /dev/null +++ b/simplifiedMuZero/search_policy/rhea_self_play.py @@ -0,0 
+1,227 @@ +import math +import time + +import numpy +import ray +import torch + +import models +from simplifiedMuZero.search_policy.RHEA2 import RHEA +from self_play import GameHistory + + +@ray.remote +class SelfPlayRhea: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. + """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. 
+ """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + # root, mcts_info = MCTS(self.config).run( + # self.model, + # stacked_observations, + # self.game.legal_actions(), + # self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + # True, + # ) + # action = self.select_action( + # root, + # temperature + # if not temperature_threshold + # or len(game_history.action_history) < temperature_threshold + # else 0, + # ) # 根据temperature选择动作 + actions = RHEA(self.config, self.game).run(self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + self.config.action_replace, + ) + action = actions[0] + + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + # game_history.store_search_statistics(root, self.config.action_space) + game_history.root_values.append(reward) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + return self.game.human_to_action(), None + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
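# Illustrative sketch (not part of the patch): the play_game loop above calls
# RHEA(self.config, self.game).run(...) to evolve a whole action sequence and
# then executes only actions[0].  The rolling-horizon idea behind that, with
# the DEAP machinery stripped away, looks roughly like the snippet below.
# Assumptions: `game` follows games.abstract_game.AbstractGame (reset / step /
# to_play / legal_actions / close) and still has at least one legal action;
# `rollout_value`, `rhea_first_action`, `horizon` and `population` are
# hypothetical names used only here.
import copy

import numpy as np


def rollout_value(game, actions, player_id):
    """Replay a candidate action sequence on a copy of the game and score it."""
    sim = copy.deepcopy(game)
    sim.reset()
    reward, mover, i = 0, player_id, 0
    for i, action in enumerate(actions):
        mover = sim.to_play()
        _, reward, done = sim.step(action)
        if done:
            break
    sim.close()
    reward = reward if mover == player_id else -reward
    return reward / (i + 1)  # shorter winning sequences score higher, as in RHEA2.evaluate


def rhea_first_action(game, horizon=9, population=32, seed=0):
    """Sample random legal sequences, keep the best, play only its first action."""
    rng = np.random.default_rng(seed)
    player_id = game.to_play()
    legal = game.legal_actions()
    candidates = [
        rng.choice(legal, size=min(horizon, len(legal)), replace=False)
        for _ in range(population)
    ]
    best = max(candidates, key=lambda seq: rollout_value(game, seq, player_id))
    return int(best[0])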
+ + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) diff --git a/simplified_muzero.py b/simplified_muzero.py index cd99153e..11cf7591 100644 --- a/simplified_muzero.py +++ b/simplified_muzero.py @@ -1,4 +1,4 @@ -from simplifiedMuZero.net2.models_2net import SimplifiedMuZeroNetwork +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net from muzero_general import MuZeroGeneral from muzero import load_model_menu, hyperparameter_search @@ -14,7 +14,7 @@ # muzero.train() # end_time = time.time() # print("耗时: {:.2f}秒".format(end_time - start_time)) - model_cls = SimplifiedMuZeroNetwork + model_cls = MuZeroNetwork_2net if len(sys.argv) == 2: # Train directly with: python muzero.py cartpole muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) diff --git a/simplified_muzero2.py b/simplified_muzero2.py deleted file mode 100644 index a136dd44..00000000 --- a/simplified_muzero2.py +++ /dev/null @@ -1,108 +0,0 @@ -from simplifiedMuZero.models2 import MuZeroNetwork_2net -from muzero_general import MuZeroGeneral -from muzero import load_model_menu, hyperparameter_search - -import json -import sys -import pathlib -import time -import nevergrad - -if __name__ == "__main__": - # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") - # start_time = time.time() - # muzero.train() - # end_time = time.time() - # print("耗时: {:.2f}秒".format(end_time - start_time)) - model_cls = MuZeroNetwork_2net - if len(sys.argv) == 2: - # Train directly with: python muzero.py cartpole - muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) - muzero.train() - elif len(sys.argv) == 3: - # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' - config = json.loads(sys.argv[2]) - muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) - muzero.train() - else: - print("\nWelcome to MuZero! Here's a list of games:") - # Let user pick a game - games = [ - filename.stem - for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) - if filename.name != "abstract_game.py" - ] - for i in range(len(games)): - print(f"{i}. {games[i]}") - choice = input("Enter a number to choose the game: ") - valid_inputs = [str(i) for i in range(len(games))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - - # Initialize MuZero - choice = int(choice) - game_name = games[choice] - muzero = MuZeroGeneral(game_name, model_cls=model_cls) - - while True: - # Configure running options - options = [ - "Train", - "Load pretrained model", - "Diagnose model", - "Render some self play games", - "Play against MuZero", - "Test the game manually", - "Hyperparameter search", - "Exit", - ] - print() - for i in range(len(options)): - print(f"{i}. 
{options[i]}") - - choice = input("Enter a number to choose an action: ") - valid_inputs = [str(i) for i in range(len(options))] - while choice not in valid_inputs: - choice = input("Invalid input, enter a number listed above: ") - choice = int(choice) - if choice == 0: - start_time = time.time() - muzero.train() - end_time = time.time() - print("耗时: {:.2f}秒".format(end_time - start_time)) - elif choice == 1: - load_model_menu(muzero, game_name) - elif choice == 2: - muzero.diagnose_model(30) - elif choice == 3: - muzero.test(render=True, opponent="self", muzero_player=None) - elif choice == 4: - muzero.test(render=True, opponent="human", muzero_player=0) - elif choice == 5: - env = muzero.Game() - env.reset() - env.render() - - done = False - while not done: - action = env.human_to_action() - observation, reward, done = env.step(action) - print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") - env.render() - elif choice == 6: - # Define here the parameters to tune - # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html - muzero.terminate_workers() - del muzero - budget = 20 - parallel_experiments = 2 - lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) - discount = nevergrad.p.Log(lower=0.95, upper=0.9999) - parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) - best_hyperparameters = hyperparameter_search( - game_name, parametrization, budget, parallel_experiments, 20 - ) - muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) - else: - break - print("\nDone") \ No newline at end of file diff --git a/test/deap_test.py b/test/deap_test.py index 0ec02e8e..51b930c8 100644 --- a/test/deap_test.py +++ b/test/deap_test.py @@ -1,3 +1,4 @@ +import copy import random import deap @@ -5,40 +6,115 @@ import numpy as np config = MuZeroConfig() -print(config.max_moves) from deap import base, creator, tools import numpy as np # 定义问题 -creator.create('FitnessMax', base.Fitness, weights=(-1.0,)) #优化目标:单变量,求最小值 -creator.create('Individual', list, fitness = creator.FitnessMax) #创建Individual类,继承list +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) legal_actions = 9 toolbox = base.Toolbox() -toolbox.register("Indices", random.sample, range(legal_actions), legal_actions) -toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Indices) +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) +toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# tools.initIterate返回一个生成的动作序列 +toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) -ind1 = toolbox.Individual() -print(ind1) +# ind1 = toolbox.Individual() +# print(ind1) +# 重复生成动作序列 toolbox.register("population", tools.initRepeat, list, toolbox.Individual) -pop = toolbox.population(n=36) -print(len(pop)) +# pop = toolbox.population(n=36) +# print(len(pop)) -def ea(game): - pass -# game = Game(0) -# game.reset() -# -# for i in range(9): -# game.render() + +game = Game(0) +game2 = copy.deepcopy(game) +game.reset() +game2.reset() + +actions = game.legal_actions() +np.random.shuffle(actions) + +# for i in range(config.max_moves): +# # game.render() # print(game.legal_actions()) # observation, reward, done = game.step(np.random.choice(game.legal_actions())) # # if done: # 
break + +def evaluate(actions): + game = Game(1) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +pop = toolbox.population(n=100) + +# from deap import algorithms +# pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +# result = tools.selBest(pop, k=1) + +results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) # +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) # game.render() +# game2.render() diff --git a/test/deap_test2.py b/test/deap_test2.py new file mode 100644 index 00000000..ad6de6bc --- /dev/null +++ b/test/deap_test2.py @@ -0,0 +1,119 @@ +import copy +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np +from functools import partial + +config = MuZeroConfig() + +from deap import base, creator, tools +import numpy as np +# 定义问题 +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +legal_actions = 9 + +toolbox = base.Toolbox() +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) +# toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# # tools.initIterate返回一个生成的动作序列 +# toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) + +def individual(actions, max_moves, replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + +# print(individual([0,1,2,3,4], 9, replace=False)) +# print(individual([0,1,2,3,4], 9, replace=True)) +# exit() + +def population(actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(individual, actions, max_moves, replace), N) + +pop = population(range(9),9, N=4, replace=False) +print(pop) + +# exit() +# +# # 重复生成动作序列 +# toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +game = Game(0) + +actions = game.legal_actions() +np.random.shuffle(actions) + +def evaluate(actions): + game = Game(1) + game.reset() + + for i 
in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +# pop = toolbox.population(n=100) +# pop = [[0, 6, 8, 7, 4, 5, 2, 1, 3], [0, 6, 3, 7, 4, 5, 2, 1, 8]] + +from deap import algorithms +pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +results = tools.selBest(pop, k=1) + +# results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) +# +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) +# game.render() +# game2.render() diff --git a/test/load_model.py b/test/load_model.py new file mode 100644 index 00000000..88e83520 --- /dev/null +++ b/test/load_model.py @@ -0,0 +1,12 @@ +import torch + +import simplifiedMuZero.net2.models2 as models +from games.tictactoe import Game, MuZeroConfig + +from game_tournament import load_model + +config = MuZeroConfig() + +muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" +muzero_2net_model = load_model(models.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) + From 50c2c013a1a0731d50b96a94473686d760aeae21 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 23 Aug 2023 22:45:30 +0100 Subject: [PATCH 7/9] Replace fully connected network replace resnet in Tic-tac-toe --- game_tournament.py | 3 +- game_tournament2.py | 389 +++++++++++++++++++++++++++++++++++++++++++ game_tournament3.py | 390 ++++++++++++++++++++++++++++++++++++++++++++ games/tictactoe2.py | 361 ++++++++++++++++++++++++++++++++++++++++ games/tictactoe3.py | 354 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1496 insertions(+), 1 deletion(-) create mode 100644 game_tournament2.py create mode 100644 game_tournament3.py create mode 100644 games/tictactoe2.py create mode 100644 games/tictactoe3.py diff --git a/game_tournament.py b/game_tournament.py index 9e8499e5..8c87e7ef 100644 --- a/game_tournament.py +++ b/game_tournament.py @@ -345,7 +345,8 @@ def load_model(model_cls, model_path, config): # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - 
muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" diff --git a/game_tournament2.py b/game_tournament2.py new file mode 100644 index 00000000..cfdd56d6 --- /dev/null +++ b/game_tournament2.py @@ -0,0 +1,389 @@ +import pickle + +import torch +import copy +import numpy + +from games.tictactoe import MuZeroConfig, Game +import models +import simplifiedMuZero.net2.models2 as models2 +from self_play import MCTS, GameHistory,SelfPlay + +class GameTournament: + def __init__(self, config:MuZeroConfig): + self.models = [] + self.game = Game(config.seed) + self.config = config + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 0 + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def play_competition(self, model1, search_policy1, model2, search_policy2): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model1.eval() + model2.eval() + + is_model1 = True + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
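# Illustrative sketch (not part of the patch): play_competition returns the pair
# (have_winner, is_model1 == (reward > 0)); the comment table further down in
# this method spells out why the second element is True exactly when model 1
# won.  The hypothetical helper below only restates that convention so the
# tallying loops in play_tournament read as explicit win/draw/loss counts.
def competition_result(have_winner: bool, model1_won: bool) -> str:
    """Map play_competition's return value to 'model1', 'model2' or 'draw'."""
    if not have_winner:
        return "draw"
    return "model1" if model1_won else "model2"


# Hypothetical usage, mirroring the counters kept in play_tournament below:
# outcomes = [competition_result(*tournament.play_competition(m1, MCTS, m2, MCTS))
#             for _ in range(rollnum)]
# tally = {name: outcomes.count(name) for name in ("model1", "model2", "draw")}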
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + model = model1 if is_model1 else model2 + search_policy = search_policy1 if is_model1 else search_policy2 + + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model1 = not is_model1 + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model1 == (reward > 0) + + def play_with_expert(self, model, search_policy, expert_first=True): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model.eval() + + is_model = not expert_first + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
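# Illustrative sketch (not part of the patch): play_with_expert returns
# (have_winner, model_won) where model_won is is_model == (reward > 0), and
# play_tournament_with_expert further down plays one batch of games with the
# model moving first (expert_first=False) and another with the expert moving
# first.  The hypothetical helper below condenses that per-seating bookkeeping;
# `tournament` is an instance of this GameTournament class and `search_policy`
# would be the MCTS class imported at the top of this file.
def evaluate_vs_expert(tournament, model, search_policy, rollnum, expert_first):
    """Tally wins, losses and draws for one seating against the expert agent."""
    tally = {"model": 0, "expert": 0, "draw": 0}
    for _ in range(rollnum):
        have_winner, model_won = tournament.play_with_expert(
            model, search_policy, expert_first=expert_first
        )
        if not have_winner:
            tally["draw"] += 1
        elif model_won:
            tally["model"] += 1
        else:
            tally["expert"] += 1
    return tally


# Hypothetical usage, mirroring the two printed blocks in play_tournament_with_expert:
# as_first_player = evaluate_vs_expert(tournament, model, MCTS, 100, expert_first=False)
# as_second_player = evaluate_vs_expert(tournament, model, MCTS, 100, expert_first=True)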
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + + if is_model: + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + else: + action = self.game.expert_agent() + root = None + + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model = not is_model + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model == (reward > 0) + + def close_game(self): + self.game.close() + + def play_tournament(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + # # 交换顺序,再来一遍 + # for _ in range(rollnum): + # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) + # + # if have_winner: + # if is_model1: + # model2_win_num += 1 + # else: + # model1_win_num += 1 + # else: + # no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + + print(models[j]["name"]," ,", models[i]["name"]," : ") + + print(models[j]["name"], " win : ", model1_win_num) + print(models[i]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + def play_tournament_with_expert(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + model = models[i]["model"] + + # model1_win_num = 
sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + + print(models[i]["name"], " ,", "expert : ") + + print(models[i]["name"], " win : ", model_win_num) + print("expert win : ", expert_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + print("expert : ", " ,", models[i]["name"]) + + print("expert win : ", expert_win_num) + print(models[i]["name"], " win : ", model_win_num) + print("No Winner", no_winner_num) + print("===================================") + + + +def load_model(model_cls, model_path, config): + checkpoint = torch.load(model_path) + model = model_cls(config) + model.set_weights(checkpoint["weights"]) + + return model + + +if __name__ == "__main__": + config = MuZeroConfig() + + config.network = "fullyconnected" + # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-23--14-25-59\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) + + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) + + config2 = MuZeroConfig() + config2.network = "resnet" + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) + + # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) + # + # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + # + # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_no_policy_model = 
load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + # + # + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + # + # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + + + game_tournament = GameTournament(config) + + models = [ + {"name":"muzero_2net", "model":muzero_2net_model}, + # {"name":"uniform", "model":uniform_model}, + {"name":"muzero", "model":muzero_model}, + # {"name": "without_rb", "model": without_rb_model}, + # {"name": "no policy value", "model": muzero_no_policy_model}, + # {"name": "simplified_muzero", "model": without_rb_model}, + ] + + + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament(models, rollnum=10) + game_tournament.play_tournament_with_expert(models, rollnum=100) + + game_tournament.close_game() + diff --git a/game_tournament3.py b/game_tournament3.py new file mode 100644 index 00000000..14d1dec7 --- /dev/null +++ b/game_tournament3.py @@ -0,0 +1,390 @@ +import pickle + +import torch +import copy +import numpy + +from games.tictactoe2 import MuZeroConfig, Game +import models +import simplifiedMuZero.net2.models2 as models2 +from self_play import MCTS, GameHistory,SelfPlay + +class GameTournament: + def __init__(self, config:MuZeroConfig): + self.models = [] + self.game = Game(config.seed) + self.config = config + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 0 + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def play_competition(self, model1, search_policy1, model2, search_policy2): + game_history = GameHistory() + + observation = self.game.reset() + + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + model1.eval() + model2.eval() + + is_model1 = True + while not done: + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
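These two asserts pin the raw observation to config.observation_shape, which is (3, 3, 3) for the TicTacToe games in this patch: three 3x3 planes holding player 1's stones, player 2's stones, and the side to move. Purely as a reading aid, a standalone sketch of that encoding, mirroring TicTacToe.get_observation from games/tictactoe2.py further down in this patch (the two sample moves are invented for illustration):

import numpy

# Mirror of TicTacToe.get_observation: encode the board as three 3x3 planes.
board = numpy.zeros((3, 3), dtype="int32")
board[1, 1] = 1    # player 1 took the centre (sample move, not from the patch)
board[0, 2] = -1   # player -1 replied in a corner (sample move)
player = 1         # side to move next

board_player1 = numpy.where(board == 1, 1, 0)    # plane of player 1's stones
board_player2 = numpy.where(board == -1, 1, 0)   # plane of player -1's stones
board_to_play = numpy.full((3, 3), player)       # constant plane marking whose turn it is
observation = numpy.array([board_player1, board_player2, board_to_play], dtype="int32")

assert observation.shape == (3, 3, 3)  # matches MuZeroConfig.observation_shape above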
+            stacked_observations = game_history.get_stacked_observations(
+                -1, self.config.stacked_observations, len(self.config.action_space)
+            )
+
+            model = model1 if is_model1 else model2
+            search_policy = search_policy1 if is_model1 else search_policy2
+
+            root, mcts_info = search_policy(self.config).run(
+                model,
+                stacked_observations,
+                self.game.legal_actions(),
+                self.game.to_play(),  # to_play returns the id of the player to move (0 by default)
+                True,
+            )
+
+            action = SelfPlay.select_action(root, 0)  # temperature 0 means no sampling: always pick the most visited action
+            observation, reward, done = self.game.step(action)
+
+            game_history.store_search_statistics(root, self.config.action_space)
+
+            # Next batch
+            game_history.action_history.append(action)
+            game_history.observation_history.append(observation)  # append to the observation queue; get_stacked_observations later reads it from the end backwards
+            game_history.reward_history.append(reward)
+            game_history.to_play_history.append(self.game.to_play())
+
+            # if the game is not over, switch to the other model
+            if not done:
+                is_model1 = not is_model1
+
+        # print("is model",is_model1, "reward is ", reward)
+
+        # restore the player id to its previous value, otherwise the winner check gives the wrong result
+        self.game.env.player *= -1
+
+        # Return value interpretation (is_model1, reward > 0, returned flag):
+        # | True  | True  | True  |  model 1 made the last move and won  -> winner is model 1
+        # | True  | False | False |  model 1 made the last move and lost -> winner is model 2
+        # | False | True  | False |  model 2 made the last move and won  -> winner is model 2
+        # | False | False | True  |  model 2 made the last move and lost -> winner is model 1
+        return self.game.env.have_winner(), is_model1 == (reward > 0)
+
+    def play_with_expert(self, model, search_policy, expert_first=True):
+        game_history = GameHistory()
+
+        observation = self.game.reset()
+
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # append the observation returned by reset
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+
+        model.eval()
+
+        is_model = not expert_first
+        while not done:
+            assert (
+                len(numpy.array(observation).shape) == 3
+            ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+            assert (
+                numpy.array(observation).shape == self.config.observation_shape
+            ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
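The comparison table in play_competition above reduces to one test: the side that made the final move is the winner exactly when that move's reward is positive, which is why the method returns is_model1 == (reward > 0). A standalone sketch of that attribution rule, as a reading aid only (attribute_winner is a hypothetical helper, not part of this patch; in this TicTacToe wrapper the step reward is only ever 0 or 20, so the negative-reward cases below exist only to mirror the table):

def attribute_winner(have_winner, last_mover_is_model1, reward):
    """Return None for a draw, otherwise 1 or 2 for the winning model."""
    if not have_winner:
        return None
    # The mover who ended the game won exactly when its final reward is positive,
    # so model 1 is the winner iff last_mover_is_model1 == (reward > 0).
    return 1 if last_mover_is_model1 == (reward > 0) else 2

# The four rows of the comment table:
assert attribute_winner(True, True, 20) == 1     # model 1 moved last and won
assert attribute_winner(True, True, -20) == 2    # model 1 moved last and lost (cannot occur here, reward is 0 or 20)
assert attribute_winner(True, False, 20) == 2    # model 2 moved last and won
assert attribute_winner(True, False, -20) == 1   # model 2 moved last and lost (likewise hypothetical)
assert attribute_winner(False, True, 0) is None  # draw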
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + + + if is_model: + root, mcts_info = search_policy(self.config).run( + model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 + else: + action = self.game.expert_agent() + root = None + + observation, reward, done = self.game.step(action) + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + # 如果没有结束,就取反 + if not done: + is_model = not is_model + + # print("is model",is_model1, "reward is ", reward) + + # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 + self.game.env.player *= -1 + + # 返回值处理 + # |-----|-----|-----| + # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 + # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 + # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 + # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 + return self.game.env.have_winner(), is_model == (reward > 0) + + def close_game(self): + self.game.close() + + def play_tournament(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + # # 交换顺序,再来一遍 + # for _ in range(rollnum): + # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) + # + # if have_winner: + # if is_model1: + # model2_win_num += 1 + # else: + # model1_win_num += 1 + # else: + # no_winner_num += 1 + + # print(is_model1) + + print(models[i]["name"]," ,", models[j]["name"]," : ") + + print(models[i]["name"], " win : ", model1_win_num) + print(models[j]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + for i in range(model_num): + for j in range(i+1, model_num): + model1 = models[i]["model"] + model2 = models[j]["model"] + + # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model1_win_num = 0 + model2_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) + + if have_winner: + if is_model1: + model1_win_num += 1 + else: + model2_win_num += 1 + else: + no_winner_num += 1 + + + print(models[j]["name"]," ,", models[i]["name"]," : ") + + print(models[j]["name"], " win : ", model1_win_num) + print(models[i]["name"], " win : ", model2_win_num) + print("No Winner", no_winner_num) + print("===================================") + + def play_tournament_with_expert(self, models, rollnum=1000): + model_num = len(models) + + for i in range(model_num): + model = models[i]["model"] + + # model1_win_num = 
sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + + for _ in range(rollnum): + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + + print(models[i]["name"], " ,", "expert : ") + + print(models[i]["name"], " win : ", model_win_num) + print("expert win : ", expert_win_num) + print("No Winner", no_winner_num) + print("===================================") + + model_win_num = 0 + expert_win_num = 0 + no_winner_num = 0 + for _ in range(rollnum): + # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) + # + # if have_winner: + # if is_model: + # model_win_num += 1 + # else: + # expert_win_num += 1 + # else: + # no_winner_num += 1 + + have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) + + if have_winner: + if is_model: + model_win_num += 1 + else: + expert_win_num += 1 + else: + no_winner_num += 1 + + print("expert : ", " ,", models[i]["name"]) + + print("expert win : ", expert_win_num) + print(models[i]["name"], " win : ", model_win_num) + print("No Winner", no_winner_num) + print("===================================") + + + +def load_model(model_cls, model_path, config): + checkpoint = torch.load(model_path) + model = model_cls(config) + model.set_weights(checkpoint["weights"]) + + return model + + +if __name__ == "__main__": + config = MuZeroConfig() + + # config.network = "fullyconnected" + # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint" + muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) + + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) + + config2 = MuZeroConfig() + config2.network = "resnet" + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) + + # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) + # + # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + # + # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_no_policy_model = 
load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + # + # + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + # + # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + + + game_tournament = GameTournament(config) + + models = [ + {"name":"muzero_2net", "model":muzero_2net_model}, + # {"name":"uniform", "model":uniform_model}, + {"name":"muzero", "model":muzero_model}, + {"name": "muzero2", "model": muzero_model}, + # {"name": "without_rb", "model": without_rb_model}, + # {"name": "no policy value", "model": muzero_no_policy_model}, + # {"name": "simplified_muzero", "model": without_rb_model}, + ] + + + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament(models, rollnum=10) + game_tournament.play_tournament_with_expert(models, rollnum=10) + + game_tournament.close_game() + diff --git a/games/tictactoe2.py b/games/tictactoe2.py new file mode 100644 index 00000000..ff9a90bf --- /dev/null +++ b/games/tictactoe2.py @@ -0,0 +1,361 @@ +import datetime +import pathlib + +import numpy +import torch + +from .abstract_game import AbstractGame + + +class MuZeroConfig: + def __init__(self): + # fmt: off + # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization + + self.seed = 0 # Seed for numpy, torch and the game + self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available + + + + ### Game + self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length + self.players = list(range(2)) # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class + + # 动作是否能重复 + self.action_replace = False + + ### Self-Play + self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer + self.selfplay_on_gpu = False + self.max_moves = 9 # Maximum number of moves if game is not finished before + self.num_simulations = 25 # Number of future moves self-simulated + self.discount = 1 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). 
If None, visit_softmax_temperature_fn is used every time + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.1 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + + + ### Network + # self.network = "resnet" # "resnet" / "fullyconnected" + self.network = "fullyconnected" + self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward))) + + # Residual Network + self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 16 # Number of channels in the ResNet + self.reduced_channels_reward = 16 # Number of channels in reward head + self.reduced_channels_value = 16 # Number of channels in value head + self.reduced_channels_policy = 16 # Number of channels in policy head + self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + # self.encoding_size = 32 + # self.fc_representation_layers = [] # Define the hidden layers in the representation network + # self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + # self.fc_reward_layers = [16] # Define the hidden layers in the reward network + # self.fc_value_layers = [] # Define the hidden layers in the value network + # self.fc_policy_layers = [] # Define the hidden layers in the policy network + + self.encoding_size = 32 + self.fc_representation_layers = [16] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + self.fc_reward_layers = [16] # Define the hidden layers in the reward network + self.fc_value_layers = [16] # Define the hidden layers in the value network + self.fc_policy_layers = [16] + + + ### Training + self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs + self.save_model = True # Save the checkpoint in results_path as model.checkpoint + # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + # self.training_steps = 50000 + self.training_steps = 500000 + self.batch_size = 64 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing + self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.003 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 10000 + + + + ### Replay Buffer + self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 20 # Number of game moves to keep for every batch element + self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value + self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + + # Reanalyze (See paper appendix Reanalyse) + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + self.reanalyse_on_gpu = False + + + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + # fmt: on + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + return 1 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + self.env = TicTacToe() + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. + + Returns: + The new observation, the reward and a boolean if the game has ended. + """ + observation, reward, done = self.env.step(action) + return observation, reward * 20, done + + def to_play(self): + """ + Return the current player. + + Returns: + The current player, it should be an element of the players list in the config. + """ + return self.env.to_play() + + def legal_actions(self): + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + return self.env.legal_actions() + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return self.env.reset() + + def render(self): + """ + Display the game observation. + """ + self.env.render() + input("Press enter to take a step ") + + def human_to_action(self): + """ + For multiplayer games, ask the user for a legal action + and return the corresponding action number. + + Returns: + An integer from the action space. 
+ """ + while True: + try: + row = int( + input( + f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + col = int( + input( + f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + choice = (row - 1) * 3 + (col - 1) + if ( + choice in self.legal_actions() + and 1 <= row + and 1 <= col + and row <= 3 + and col <= 3 + ): + break + except: + pass + print("Wrong input, try again") + return choice + + def expert_agent(self): + """ + Hard coded agent that MuZero faces to assess his progress in multiplayer games. + It doesn't influence training + + Returns: + Action as an integer to take in the current game state + """ + return self.env.expert_action() + + def action_to_string(self, action_number): + """ + Convert an action number to a string representing the action. + + Args: + action_number: an integer from the action space. + + Returns: + String representing the action. + """ + row = action_number // 3 + 1 + col = action_number % 3 + 1 + return f"Play row {row}, column {col}" + + +class TicTacToe: + def __init__(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + + def to_play(self): + return 0 if self.player == 1 else 1 + + def reset(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + return self.get_observation() + + def step(self, action): + row = action // 3 + col = action % 3 + self.board[row, col] = self.player + + done = self.have_winner() or len(self.legal_actions()) == 0 + + reward = 1 if self.have_winner() else 0 + + self.player *= -1 + + return self.get_observation(), reward, done + + def get_observation(self): + board_player1 = numpy.where(self.board == 1, 1, 0) + board_player2 = numpy.where(self.board == -1, 1, 0) + board_to_play = numpy.full((3, 3), self.player) + return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") + + def legal_actions(self): + legal = [] + for i in range(9): + row = i // 3 + col = i % 3 + if self.board[row, col] == 0: + legal.append(i) + return legal + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def expert_action(self): + board = self.board + action = numpy.random.choice(self.legal_actions()) + # Horizontal and vertical checks + for i in range(3): + if abs(sum(board[i, :])) == 2: + ind = numpy.where(board[i, :] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([i]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(board[i, :]) > 0: + return action + + if abs(sum(board[:, i])) == 2: + ind = numpy.where(board[:, i] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([i])), (3, 3) + )[0] + if self.player * sum(board[:, i]) > 0: + return action + + # Diagonal checks + diag = board.diagonal() + anti_diag = numpy.fliplr(board).diagonal() + if abs(sum(diag)) == 2: + ind = numpy.where(diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(diag) > 0: + return action + + if 
abs(sum(anti_diag)) == 2: + ind = numpy.where(anti_diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) + )[0] + if self.player * sum(anti_diag) > 0: + return action + + return action + + def render(self): + print(self.board[::-1]) diff --git a/games/tictactoe3.py b/games/tictactoe3.py new file mode 100644 index 00000000..1078bff0 --- /dev/null +++ b/games/tictactoe3.py @@ -0,0 +1,354 @@ +import datetime +import pathlib + +import numpy +import torch + +from .abstract_game import AbstractGame + + +class MuZeroConfig: + def __init__(self): + # fmt: off + # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization + + self.seed = 0 # Seed for numpy, torch and the game + self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available + + + + ### Game + self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) + self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length + self.players = list(range(2)) # List of players. You should only edit the length + self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation + + # Evaluate + self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) + self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class + + # 动作是否能重复 + self.action_replace = False + + ### Self-Play + self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer + self.selfplay_on_gpu = False + self.max_moves = 9 # Maximum number of moves if game is not finished before + self.num_simulations = 25 # Number of future moves self-simulated + self.discount = 1 # Chronological discount of the reward + self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). If None, visit_softmax_temperature_fn is used every time + + # Root prior exploration noise + self.root_dirichlet_alpha = 0.1 + self.root_exploration_fraction = 0.25 + + # UCB formula + self.pb_c_base = 19652 + self.pb_c_init = 1.25 + + + + ### Network + self.network = "resnet" # "resnet" / "fullyconnected" + self.network = "fullyconnected" + self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. 
Choose it so that support_size <= sqrt(max(abs(discounted reward))) + + # Residual Network + self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) + self.blocks = 1 # Number of blocks in the ResNet + self.channels = 16 # Number of channels in the ResNet + self.reduced_channels_reward = 16 # Number of channels in reward head + self.reduced_channels_value = 16 # Number of channels in value head + self.reduced_channels_policy = 16 # Number of channels in policy head + self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network + self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network + self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network + + # Fully Connected Network + self.encoding_size = 32 + self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + self.fc_reward_layers = [16] # Define the hidden layers in the reward network + self.fc_value_layers = [] # Define the hidden layers in the value network + self.fc_policy_layers = [] # Define the hidden layers in the policy network + + + + ### Training + self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs + self.save_model = True # Save the checkpoint in results_path as model.checkpoint + self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) + # self.training_steps = 50000 + self.batch_size = 64 # Number of parts of games to train on at each training step + self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing + self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) + self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available + + self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD + self.weight_decay = 1e-4 # L2 weights regularization + self.momentum = 0.9 # Used only if optimizer is SGD + + # Exponential learning rate schedule + self.lr_init = 0.003 # Initial learning rate + self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate + self.lr_decay_steps = 10000 + + + + ### Replay Buffer + self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer + self.num_unroll_steps = 20 # Number of game moves to keep for every batch element + self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value + self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network + self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 + + # Reanalyze (See paper appendix Reanalyse) + self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) + self.reanalyse_on_gpu = False + + + + ### Adjust the self play / training ratio to avoid over/underfitting + self.self_play_delay = 0 # Number of seconds to wait after each played game + self.training_delay = 0 # Number of seconds to wait after each training step + self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it + # fmt: on + + def visit_softmax_temperature_fn(self, trained_steps): + """ + Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. + The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. + + Returns: + Positive float. + """ + return 1 + + +class Game(AbstractGame): + """ + Game wrapper. + """ + + def __init__(self, seed=None): + self.env = TicTacToe() + + def step(self, action): + """ + Apply action to the game. + + Args: + action : action of the action_space to take. + + Returns: + The new observation, the reward and a boolean if the game has ended. + """ + observation, reward, done = self.env.step(action) + return observation, reward * 20, done + + def to_play(self): + """ + Return the current player. + + Returns: + The current player, it should be an element of the players list in the config. + """ + return self.env.to_play() + + def legal_actions(self): + """ + Should return the legal actions at each turn, if it is not available, it can return + the whole action space. At each turn, the game have to be able to handle one of returned actions. + + For complex game where calculating legal moves is too long, the idea is to define the legal actions + equal to the action space but to return a negative reward if the action is illegal. + + Returns: + An array of integers, subset of the action space. + """ + return self.env.legal_actions() + + def reset(self): + """ + Reset the game for a new game. + + Returns: + Initial observation of the game. + """ + return self.env.reset() + + def render(self): + """ + Display the game observation. + """ + self.env.render() + input("Press enter to take a step ") + + def human_to_action(self): + """ + For multiplayer games, ask the user for a legal action + and return the corresponding action number. + + Returns: + An integer from the action space. 
+ """ + while True: + try: + row = int( + input( + f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + col = int( + input( + f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " + ) + ) + choice = (row - 1) * 3 + (col - 1) + if ( + choice in self.legal_actions() + and 1 <= row + and 1 <= col + and row <= 3 + and col <= 3 + ): + break + except: + pass + print("Wrong input, try again") + return choice + + def expert_agent(self): + """ + Hard coded agent that MuZero faces to assess his progress in multiplayer games. + It doesn't influence training + + Returns: + Action as an integer to take in the current game state + """ + return self.env.expert_action() + + def action_to_string(self, action_number): + """ + Convert an action number to a string representing the action. + + Args: + action_number: an integer from the action space. + + Returns: + String representing the action. + """ + row = action_number // 3 + 1 + col = action_number % 3 + 1 + return f"Play row {row}, column {col}" + + +class TicTacToe: + def __init__(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + + def to_play(self): + return 0 if self.player == 1 else 1 + + def reset(self): + self.board = numpy.zeros((3, 3), dtype="int32") + self.player = 1 + return self.get_observation() + + def step(self, action): + row = action // 3 + col = action % 3 + self.board[row, col] = self.player + + done = self.have_winner() or len(self.legal_actions()) == 0 + + reward = 1 if self.have_winner() else 0 + + self.player *= -1 + + return self.get_observation(), reward, done + + def get_observation(self): + board_player1 = numpy.where(self.board == 1, 1, 0) + board_player2 = numpy.where(self.board == -1, 1, 0) + board_to_play = numpy.full((3, 3), self.player) + return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") + + def legal_actions(self): + legal = [] + for i in range(9): + row = i // 3 + col = i % 3 + if self.board[row, col] == 0: + legal.append(i) + return legal + + def have_winner(self): + # Horizontal and vertical checks + for i in range(3): + if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): + return True + + # Diagonal checks + if ( + self.board[0, 0] == self.player + and self.board[1, 1] == self.player + and self.board[2, 2] == self.player + ): + return True + if ( + self.board[2, 0] == self.player + and self.board[1, 1] == self.player + and self.board[0, 2] == self.player + ): + return True + + return False + + def expert_action(self): + board = self.board + action = numpy.random.choice(self.legal_actions()) + # Horizontal and vertical checks + for i in range(3): + if abs(sum(board[i, :])) == 2: + ind = numpy.where(board[i, :] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([i]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(board[i, :]) > 0: + return action + + if abs(sum(board[:, i])) == 2: + ind = numpy.where(board[:, i] == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([i])), (3, 3) + )[0] + if self.player * sum(board[:, i]) > 0: + return action + + # Diagonal checks + diag = board.diagonal() + anti_diag = numpy.fliplr(board).diagonal() + if abs(sum(diag)) == 2: + ind = numpy.where(diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([ind])), (3, 3) + )[0] + if self.player * sum(diag) > 0: + return action + + if 
abs(sum(anti_diag)) == 2: + ind = numpy.where(anti_diag == 0)[0][0] + action = numpy.ravel_multi_index( + (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) + )[0] + if self.player * sum(anti_diag) > 0: + return action + + return action + + def render(self): + print(self.board[::-1]) From 2747fdcc7fb7d6a7f672d60b3e32c37b1c80b7c4 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Fri, 25 Aug 2023 02:33:49 +0100 Subject: [PATCH 8/9] organize files --- game_tournament.py | 56 +-- game_tournament2.py | 389 ------------------ game_tournament3.py | 390 ------------------- games/tictactoe.py | 20 +- games/tictactoe2.py | 361 ----------------- games/tictactoe3.py | 354 ----------------- muzero_2net.py | 1 - muzero_general.py | 5 +- simplifiedMuZero/without_rb/trainer_no_PV.py | 243 ++++++++++++ 9 files changed, 292 insertions(+), 1527 deletions(-) delete mode 100644 game_tournament2.py delete mode 100644 game_tournament3.py delete mode 100644 games/tictactoe2.py delete mode 100644 games/tictactoe3.py create mode 100644 simplifiedMuZero/without_rb/trainer_no_PV.py diff --git a/game_tournament.py b/game_tournament.py index 8c87e7ef..81b1e363 100644 --- a/game_tournament.py +++ b/game_tournament.py @@ -338,49 +338,55 @@ def load_model(model_cls, model_path, config): if __name__ == "__main__": config = MuZeroConfig() + # config.network = "fullyconnected" # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--09-40-26\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint" + checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--17-12-53\model.checkpoint" muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) + config2 = MuZeroConfig() + # config2.network = "resnet" # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) - - uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) - - without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) - - muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) - - - simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - simplified_muzero = 
load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) - - # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" - # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) + # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" + muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-24--02-55-21\muzero_2net\model.checkpoint" + muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) + + # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" + # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) + # + # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" + # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) + # + # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) + # + # + # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" + # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) + # + # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" + # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) game_tournament = GameTournament(config) models = [ {"name":"muzero_2net", "model":muzero_2net_model}, - {"name":"uniform", "model":uniform_model}, + # {"name":"uniform", "model":uniform_model}, {"name":"muzero", "model":muzero_model}, - {"name": "without_rb", "model": without_rb_model}, - {"name": "no policy value", "model": muzero_no_policy_model}, - {"name": "simplified_muzero", "model": without_rb_model}, + # {"name": "muzero2", "model": muzero_model}, + # {"name": "without_rb", "model": without_rb_model}, + # {"name": "no policy value", "model": muzero_no_policy_model}, + # {"name": "simplified_muzero", "model": without_rb_model}, ] # game_tournament.play_tournament(models, rollnum=1000) - game_tournament.play_tournament(models, rollnum=10) - game_tournament.play_tournament_with_expert(models, rollnum=100) + # game_tournament.play_tournament(models, rollnum=1000) + game_tournament.play_tournament_with_expert(models, rollnum=500) game_tournament.close_game() diff --git a/game_tournament2.py b/game_tournament2.py deleted file mode 100644 index cfdd56d6..00000000 --- a/game_tournament2.py +++ /dev/null @@ -1,389 +0,0 @@ -import pickle - -import torch -import copy -import numpy - -from games.tictactoe import MuZeroConfig, Game -import models -import simplifiedMuZero.net2.models2 as models2 -from self_play import MCTS, GameHistory,SelfPlay - -class GameTournament: - def __init__(self, config:MuZeroConfig): - self.models = [] - self.game = Game(config.seed) - self.config 
= config - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 0 - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def play_competition(self, model1, search_policy1, model2, search_policy2): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model1.eval() - model2.eval() - - is_model1 = True - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - model = model1 if is_model1 else model2 - search_policy = search_policy1 if is_model1 else search_policy2 - - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model1 = not is_model1 - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model1 == (reward > 0) - - def play_with_expert(self, model, search_policy, expert_first=True): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model.eval() - - is_model = not expert_first - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of 
{len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - - if is_model: - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - else: - action = self.game.expert_agent() - root = None - - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model = not is_model - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model == (reward > 0) - - def close_game(self): - self.game.close() - - def play_tournament(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - # # 交换顺序,再来一遍 - # for _ in range(rollnum): - # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) - # - # if have_winner: - # if is_model1: - # model2_win_num += 1 - # else: - # model1_win_num += 1 - # else: - # no_winner_num += 1 - - # print(is_model1) - - print(models[i]["name"]," ,", models[j]["name"]," : ") - - print(models[i]["name"], " win : ", model1_win_num) - print(models[j]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - - print(models[j]["name"]," ,", models[i]["name"]," : ") - - 
print(models[j]["name"], " win : ", model1_win_num) - print(models[i]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - def play_tournament_with_expert(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - model = models[i]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - - print(models[i]["name"], " ,", "expert : ") - - print(models[i]["name"], " win : ", model_win_num) - print("expert win : ", expert_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - for _ in range(rollnum): - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - print("expert : ", " ,", models[i]["name"]) - - print("expert win : ", expert_win_num) - print(models[i]["name"], " win : ", model_win_num) - print("No Winner", no_winner_num) - print("===================================") - - - -def load_model(model_cls, model_path, config): - checkpoint = torch.load(model_path) - model = model_cls(config) - model.set_weights(checkpoint["weights"]) - - return model - - -if __name__ == "__main__": - config = MuZeroConfig() - - config.network = "fullyconnected" - # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-23--14-25-59\model.checkpoint" - muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) - - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - - config2 = MuZeroConfig() - config2.network = "resnet" - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) - - # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) - # - # without_rb_checkpoint_path = 
r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) - # - # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) - # - # - # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) - # - # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" - # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) - - - game_tournament = GameTournament(config) - - models = [ - {"name":"muzero_2net", "model":muzero_2net_model}, - # {"name":"uniform", "model":uniform_model}, - {"name":"muzero", "model":muzero_model}, - # {"name": "without_rb", "model": without_rb_model}, - # {"name": "no policy value", "model": muzero_no_policy_model}, - # {"name": "simplified_muzero", "model": without_rb_model}, - ] - - - # game_tournament.play_tournament(models, rollnum=1000) - game_tournament.play_tournament(models, rollnum=10) - game_tournament.play_tournament_with_expert(models, rollnum=100) - - game_tournament.close_game() - diff --git a/game_tournament3.py b/game_tournament3.py deleted file mode 100644 index 14d1dec7..00000000 --- a/game_tournament3.py +++ /dev/null @@ -1,390 +0,0 @@ -import pickle - -import torch -import copy -import numpy - -from games.tictactoe2 import MuZeroConfig, Game -import models -import simplifiedMuZero.net2.models2 as models2 -from self_play import MCTS, GameHistory,SelfPlay - -class GameTournament: - def __init__(self, config:MuZeroConfig): - self.models = [] - self.game = Game(config.seed) - self.config = config - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 0 - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def play_competition(self, model1, search_policy1, model2, search_policy2): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model1.eval() - model2.eval() - - is_model1 = True - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. 
Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." - stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - model = model1 if is_model1 else model2 - search_policy = search_policy1 if is_model1 else search_policy2 - - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model1 = not is_model1 - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model1 == (reward > 0) - - def play_with_expert(self, model, search_policy, expert_first=True): - game_history = GameHistory() - - observation = self.game.reset() - - game_history.action_history.append(0) - game_history.observation_history.append(observation) # 添加reset之后的observation - game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) - - done = False - - model.eval() - - is_model = not expert_first - while not done: - assert ( - len(numpy.array(observation).shape) == 3 - ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" - assert ( - numpy.array(observation).shape == self.config.observation_shape - ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
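# Illustrative sketch of what the two asserts above expect. For the TicTacToe configs in
# this patch, config.observation_shape is (3, 3, 3): one plane for player 1's marks, one
# for player 2's marks, and one plane filled with the player to move, as built by
# TicTacToe.get_observation() further down in this patch. A minimal sketch, assuming an
# empty board with player 1 to move (variable names here are illustrative only):
import numpy
board = numpy.zeros((3, 3), dtype="int32")
player = 1
observation = numpy.array(
    [
        numpy.where(board == 1, 1, 0),    # player 1 plane
        numpy.where(board == -1, 1, 0),   # player 2 plane
        numpy.full((3, 3), player),       # to-play plane
    ],
    dtype="int32",
)
assert len(observation.shape) == 3 and observation.shape == (3, 3, 3)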
- stacked_observations = game_history.get_stacked_observations( - -1, self.config.stacked_observations, len(self.config.action_space) - ) - - - if is_model: - root, mcts_info = search_policy(self.config).run( - model, - stacked_observations, - self.game.legal_actions(), - self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 - True, - ) - action = SelfPlay.select_action(root, 0) # 第二个参数阈值为0表示不会偏移,选择最大的 - else: - action = self.game.expert_agent() - root = None - - observation, reward, done = self.game.step(action) - - game_history.store_search_statistics(root, self.config.action_space) - - # Next batch - game_history.action_history.append(action) - game_history.observation_history.append(observation) # 添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 - game_history.reward_history.append(reward) - game_history.to_play_history.append(self.game.to_play()) - - # 如果没有结束,就取反 - if not done: - is_model = not is_model - - # print("is model",is_model1, "reward is ", reward) - - # 将player的id变回之前的id,否则检查是否有圣者时会发生错误 - self.game.env.player *= -1 - - # 返回值处理 - # |-----|-----|-----| - # | True | True | True | 表示模型1结束,结果为获胜。因此获胜的模型为模型1 - # | True | False | False | 表示模型1结束,结果为失败。因此获胜的模型为模型2 - # | False | True | False | 表示模型2结束,结果为获胜。因此获胜的模型为模型2 - # | False | False | True | 表示模型2结束,结果为失败。因此获胜的模型为模型1 - return self.game.env.have_winner(), is_model == (reward > 0) - - def close_game(self): - self.game.close() - - def play_tournament(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - # # 交换顺序,再来一遍 - # for _ in range(rollnum): - # have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS) - # - # if have_winner: - # if is_model1: - # model2_win_num += 1 - # else: - # model1_win_num += 1 - # else: - # no_winner_num += 1 - - # print(is_model1) - - print(models[i]["name"]," ,", models[j]["name"]," : ") - - print(models[i]["name"], " win : ", model1_win_num) - print(models[j]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - for i in range(model_num): - for j in range(i+1, model_num): - model1 = models[i]["model"] - model2 = models[j]["model"] - - # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model1_win_num = 0 - model2_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS) - - if have_winner: - if is_model1: - model1_win_num += 1 - else: - model2_win_num += 1 - else: - no_winner_num += 1 - - - print(models[j]["name"]," ,", models[i]["name"]," : ") - - print(models[j]["name"], " win : ", model1_win_num) - print(models[i]["name"], " win : ", model2_win_num) - print("No Winner", no_winner_num) - print("===================================") - - def play_tournament_with_expert(self, models, rollnum=1000): - model_num = len(models) - - for i in range(model_num): - model = models[i]["model"] - - # model1_win_num = 
sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)]) - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - - for _ in range(rollnum): - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - - print(models[i]["name"], " ,", "expert : ") - - print(models[i]["name"], " win : ", model_win_num) - print("expert win : ", expert_win_num) - print("No Winner", no_winner_num) - print("===================================") - - model_win_num = 0 - expert_win_num = 0 - no_winner_num = 0 - for _ in range(rollnum): - # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False) - # - # if have_winner: - # if is_model: - # model_win_num += 1 - # else: - # expert_win_num += 1 - # else: - # no_winner_num += 1 - - have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True) - - if have_winner: - if is_model: - model_win_num += 1 - else: - expert_win_num += 1 - else: - no_winner_num += 1 - - print("expert : ", " ,", models[i]["name"]) - - print("expert win : ", expert_win_num) - print(models[i]["name"], " win : ", model_win_num) - print("No Winner", no_winner_num) - print("===================================") - - - -def load_model(model_cls, model_path, config): - checkpoint = torch.load(model_path) - model = model_cls(config) - model.set_weights(checkpoint["weights"]) - - return model - - -if __name__ == "__main__": - config = MuZeroConfig() - - # config.network = "fullyconnected" - # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint" - checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint" - muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config) - - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config) - - config2 = MuZeroConfig() - config2.network = "resnet" - # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" - muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint" - muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2) - - # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint" - # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config) - # - # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint" - # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config) - # - # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # muzero_no_policy_model = 
load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config) - # - # - # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint" - # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config) - # - # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint" - # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config) - - - game_tournament = GameTournament(config) - - models = [ - {"name":"muzero_2net", "model":muzero_2net_model}, - # {"name":"uniform", "model":uniform_model}, - {"name":"muzero", "model":muzero_model}, - {"name": "muzero2", "model": muzero_model}, - # {"name": "without_rb", "model": without_rb_model}, - # {"name": "no policy value", "model": muzero_no_policy_model}, - # {"name": "simplified_muzero", "model": without_rb_model}, - ] - - - # game_tournament.play_tournament(models, rollnum=1000) - game_tournament.play_tournament(models, rollnum=10) - game_tournament.play_tournament_with_expert(models, rollnum=10) - - game_tournament.close_game() - diff --git a/games/tictactoe.py b/games/tictactoe.py index 787986fb..ff9a90bf 100644 --- a/games/tictactoe.py +++ b/games/tictactoe.py @@ -49,7 +49,8 @@ def __init__(self): ### Network - self.network = "resnet" # "resnet" / "fullyconnected" + # self.network = "resnet" # "resnet" / "fullyconnected" + self.network = "fullyconnected" self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward))) # Residual Network @@ -64,20 +65,27 @@ def __init__(self): self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network # Fully Connected Network + # self.encoding_size = 32 + # self.fc_representation_layers = [] # Define the hidden layers in the representation network + # self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network + # self.fc_reward_layers = [16] # Define the hidden layers in the reward network + # self.fc_value_layers = [] # Define the hidden layers in the value network + # self.fc_policy_layers = [] # Define the hidden layers in the policy network + self.encoding_size = 32 - self.fc_representation_layers = [] # Define the hidden layers in the representation network + self.fc_representation_layers = [16] # Define the hidden layers in the representation network self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network self.fc_reward_layers = [16] # Define the hidden layers in the reward network - self.fc_value_layers = [] # Define the hidden layers in the value network - self.fc_policy_layers = [] # Define the hidden layers in the policy network - + self.fc_value_layers = [16] # Define the hidden layers in the value network + self.fc_policy_layers = [16] ### Training self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs self.save_model = True # Save the checkpoint in results_path as model.checkpoint # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) - self.training_steps = 
50000 + # self.training_steps = 50000 + self.training_steps = 500000 self.batch_size = 64 # Number of parts of games to train on at each training step self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) diff --git a/games/tictactoe2.py b/games/tictactoe2.py deleted file mode 100644 index ff9a90bf..00000000 --- a/games/tictactoe2.py +++ /dev/null @@ -1,361 +0,0 @@ -import datetime -import pathlib - -import numpy -import torch - -from .abstract_game import AbstractGame - - -class MuZeroConfig: - def __init__(self): - # fmt: off - # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization - - self.seed = 0 # Seed for numpy, torch and the game - self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available - - - - ### Game - self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) - self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length - self.players = list(range(2)) # List of players. You should only edit the length - self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation - - # Evaluate - self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) - self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class - - # 动作是否能重复 - self.action_replace = False - - ### Self-Play - self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer - self.selfplay_on_gpu = False - self.max_moves = 9 # Maximum number of moves if game is not finished before - self.num_simulations = 25 # Number of future moves self-simulated - self.discount = 1 # Chronological discount of the reward - self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). If None, visit_softmax_temperature_fn is used every time - - # Root prior exploration noise - self.root_dirichlet_alpha = 0.1 - self.root_exploration_fraction = 0.25 - - # UCB formula - self.pb_c_base = 19652 - self.pb_c_init = 1.25 - - - - ### Network - # self.network = "resnet" # "resnet" / "fullyconnected" - self.network = "fullyconnected" - self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. 
Choose it so that support_size <= sqrt(max(abs(discounted reward))) - - # Residual Network - self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) - self.blocks = 1 # Number of blocks in the ResNet - self.channels = 16 # Number of channels in the ResNet - self.reduced_channels_reward = 16 # Number of channels in reward head - self.reduced_channels_value = 16 # Number of channels in value head - self.reduced_channels_policy = 16 # Number of channels in policy head - self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network - self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network - self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network - - # Fully Connected Network - # self.encoding_size = 32 - # self.fc_representation_layers = [] # Define the hidden layers in the representation network - # self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network - # self.fc_reward_layers = [16] # Define the hidden layers in the reward network - # self.fc_value_layers = [] # Define the hidden layers in the value network - # self.fc_policy_layers = [] # Define the hidden layers in the policy network - - self.encoding_size = 32 - self.fc_representation_layers = [16] # Define the hidden layers in the representation network - self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network - self.fc_reward_layers = [16] # Define the hidden layers in the reward network - self.fc_value_layers = [16] # Define the hidden layers in the value network - self.fc_policy_layers = [16] - - - ### Training - self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs - self.save_model = True # Save the checkpoint in results_path as model.checkpoint - # self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) - # self.training_steps = 50000 - self.training_steps = 500000 - self.batch_size = 64 # Number of parts of games to train on at each training step - self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing - self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) - self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available - - self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD - self.weight_decay = 1e-4 # L2 weights regularization - self.momentum = 0.9 # Used only if optimizer is SGD - - # Exponential learning rate schedule - self.lr_init = 0.003 # Initial learning rate - self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate - self.lr_decay_steps = 10000 - - - - ### Replay Buffer - self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer - self.num_unroll_steps = 20 # Number of game moves to keep for every batch element - self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value - self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network - self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 - - # Reanalyze (See paper appendix Reanalyse) - self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) - self.reanalyse_on_gpu = False - - - - ### Adjust the self play / training ratio to avoid over/underfitting - self.self_play_delay = 0 # Number of seconds to wait after each played game - self.training_delay = 0 # Number of seconds to wait after each training step - self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it - # fmt: on - - def visit_softmax_temperature_fn(self, trained_steps): - """ - Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. - The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. - - Returns: - Positive float. - """ - return 1 - - -class Game(AbstractGame): - """ - Game wrapper. - """ - - def __init__(self, seed=None): - self.env = TicTacToe() - - def step(self, action): - """ - Apply action to the game. - - Args: - action : action of the action_space to take. - - Returns: - The new observation, the reward and a boolean if the game has ended. - """ - observation, reward, done = self.env.step(action) - return observation, reward * 20, done - - def to_play(self): - """ - Return the current player. - - Returns: - The current player, it should be an element of the players list in the config. - """ - return self.env.to_play() - - def legal_actions(self): - """ - Should return the legal actions at each turn, if it is not available, it can return - the whole action space. At each turn, the game have to be able to handle one of returned actions. - - For complex game where calculating legal moves is too long, the idea is to define the legal actions - equal to the action space but to return a negative reward if the action is illegal. - - Returns: - An array of integers, subset of the action space. - """ - return self.env.legal_actions() - - def reset(self): - """ - Reset the game for a new game. - - Returns: - Initial observation of the game. - """ - return self.env.reset() - - def render(self): - """ - Display the game observation. - """ - self.env.render() - input("Press enter to take a step ") - - def human_to_action(self): - """ - For multiplayer games, ask the user for a legal action - and return the corresponding action number. - - Returns: - An integer from the action space. 
- """ - while True: - try: - row = int( - input( - f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - col = int( - input( - f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - choice = (row - 1) * 3 + (col - 1) - if ( - choice in self.legal_actions() - and 1 <= row - and 1 <= col - and row <= 3 - and col <= 3 - ): - break - except: - pass - print("Wrong input, try again") - return choice - - def expert_agent(self): - """ - Hard coded agent that MuZero faces to assess his progress in multiplayer games. - It doesn't influence training - - Returns: - Action as an integer to take in the current game state - """ - return self.env.expert_action() - - def action_to_string(self, action_number): - """ - Convert an action number to a string representing the action. - - Args: - action_number: an integer from the action space. - - Returns: - String representing the action. - """ - row = action_number // 3 + 1 - col = action_number % 3 + 1 - return f"Play row {row}, column {col}" - - -class TicTacToe: - def __init__(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - - def to_play(self): - return 0 if self.player == 1 else 1 - - def reset(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - return self.get_observation() - - def step(self, action): - row = action // 3 - col = action % 3 - self.board[row, col] = self.player - - done = self.have_winner() or len(self.legal_actions()) == 0 - - reward = 1 if self.have_winner() else 0 - - self.player *= -1 - - return self.get_observation(), reward, done - - def get_observation(self): - board_player1 = numpy.where(self.board == 1, 1, 0) - board_player2 = numpy.where(self.board == -1, 1, 0) - board_to_play = numpy.full((3, 3), self.player) - return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") - - def legal_actions(self): - legal = [] - for i in range(9): - row = i // 3 - col = i % 3 - if self.board[row, col] == 0: - legal.append(i) - return legal - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def expert_action(self): - board = self.board - action = numpy.random.choice(self.legal_actions()) - # Horizontal and vertical checks - for i in range(3): - if abs(sum(board[i, :])) == 2: - ind = numpy.where(board[i, :] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([i]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(board[i, :]) > 0: - return action - - if abs(sum(board[:, i])) == 2: - ind = numpy.where(board[:, i] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([i])), (3, 3) - )[0] - if self.player * sum(board[:, i]) > 0: - return action - - # Diagonal checks - diag = board.diagonal() - anti_diag = numpy.fliplr(board).diagonal() - if abs(sum(diag)) == 2: - ind = numpy.where(diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(diag) > 0: - return action - - if 
abs(sum(anti_diag)) == 2: - ind = numpy.where(anti_diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) - )[0] - if self.player * sum(anti_diag) > 0: - return action - - return action - - def render(self): - print(self.board[::-1]) diff --git a/games/tictactoe3.py b/games/tictactoe3.py deleted file mode 100644 index 1078bff0..00000000 --- a/games/tictactoe3.py +++ /dev/null @@ -1,354 +0,0 @@ -import datetime -import pathlib - -import numpy -import torch - -from .abstract_game import AbstractGame - - -class MuZeroConfig: - def __init__(self): - # fmt: off - # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization - - self.seed = 0 # Seed for numpy, torch and the game - self.max_num_gpus = None # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every GPUs available - - - - ### Game - self.observation_shape = (3, 3, 3) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array) - self.action_space = list(range(9)) # Fixed list of all possible actions. You should only edit the length - self.players = list(range(2)) # List of players. You should only edit the length - self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation - - # Evaluate - self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second) - self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class - - # 动作是否能重复 - self.action_replace = False - - ### Self-Play - self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer - self.selfplay_on_gpu = False - self.max_moves = 9 # Maximum number of moves if game is not finished before - self.num_simulations = 25 # Number of future moves self-simulated - self.discount = 1 # Chronological discount of the reward - self.temperature_threshold = None # Number of moves before dropping the temperature given by visit_softmax_temperature_fn to 0 (ie selecting the best action). If None, visit_softmax_temperature_fn is used every time - - # Root prior exploration noise - self.root_dirichlet_alpha = 0.1 - self.root_exploration_fraction = 0.25 - - # UCB formula - self.pb_c_base = 19652 - self.pb_c_init = 1.25 - - - - ### Network - self.network = "resnet" # "resnet" / "fullyconnected" - self.network = "fullyconnected" - self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. 
Choose it so that support_size <= sqrt(max(abs(discounted reward))) - - # Residual Network - self.downsample = False # Downsample observations before representation network, False / "CNN" (lighter) / "resnet" (See paper appendix Network Architecture) - self.blocks = 1 # Number of blocks in the ResNet - self.channels = 16 # Number of channels in the ResNet - self.reduced_channels_reward = 16 # Number of channels in reward head - self.reduced_channels_value = 16 # Number of channels in value head - self.reduced_channels_policy = 16 # Number of channels in policy head - self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network - self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network - self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network - - # Fully Connected Network - self.encoding_size = 32 - self.fc_representation_layers = [] # Define the hidden layers in the representation network - self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network - self.fc_reward_layers = [16] # Define the hidden layers in the reward network - self.fc_value_layers = [] # Define the hidden layers in the value network - self.fc_policy_layers = [] # Define the hidden layers in the policy network - - - - ### Training - self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs - self.save_model = True # Save the checkpoint in results_path as model.checkpoint - self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch) - # self.training_steps = 50000 - self.batch_size = 64 # Number of parts of games to train on at each training step - self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing - self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze) - self.train_on_gpu = torch.cuda.is_available() # Train on GPU if available - - self.optimizer = "Adam" # "Adam" or "SGD". 
Paper uses SGD - self.weight_decay = 1e-4 # L2 weights regularization - self.momentum = 0.9 # Used only if optimizer is SGD - - # Exponential learning rate schedule - self.lr_init = 0.003 # Initial learning rate - self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate - self.lr_decay_steps = 10000 - - - - ### Replay Buffer - self.replay_buffer_size = 3000 # Number of self-play games to keep in the replay buffer - self.num_unroll_steps = 20 # Number of game moves to keep for every batch element - self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value - self.PER = True # Prioritized Replay (See paper appendix Training), select in priority the elements in the replay buffer which are unexpected for the network - self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1 - - # Reanalyze (See paper appendix Reanalyse) - self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze) - self.reanalyse_on_gpu = False - - - - ### Adjust the self play / training ratio to avoid over/underfitting - self.self_play_delay = 0 # Number of seconds to wait after each played game - self.training_delay = 0 # Number of seconds to wait after each training step - self.ratio = None # Desired training steps per self played step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it - # fmt: on - - def visit_softmax_temperature_fn(self, trained_steps): - """ - Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses. - The smaller it is, the more likely the best action (ie with the highest visit count) is chosen. - - Returns: - Positive float. - """ - return 1 - - -class Game(AbstractGame): - """ - Game wrapper. - """ - - def __init__(self, seed=None): - self.env = TicTacToe() - - def step(self, action): - """ - Apply action to the game. - - Args: - action : action of the action_space to take. - - Returns: - The new observation, the reward and a boolean if the game has ended. - """ - observation, reward, done = self.env.step(action) - return observation, reward * 20, done - - def to_play(self): - """ - Return the current player. - - Returns: - The current player, it should be an element of the players list in the config. - """ - return self.env.to_play() - - def legal_actions(self): - """ - Should return the legal actions at each turn, if it is not available, it can return - the whole action space. At each turn, the game have to be able to handle one of returned actions. - - For complex game where calculating legal moves is too long, the idea is to define the legal actions - equal to the action space but to return a negative reward if the action is illegal. - - Returns: - An array of integers, subset of the action space. - """ - return self.env.legal_actions() - - def reset(self): - """ - Reset the game for a new game. - - Returns: - Initial observation of the game. - """ - return self.env.reset() - - def render(self): - """ - Display the game observation. - """ - self.env.render() - input("Press enter to take a step ") - - def human_to_action(self): - """ - For multiplayer games, ask the user for a legal action - and return the corresponding action number. - - Returns: - An integer from the action space. 
- """ - while True: - try: - row = int( - input( - f"Enter the row (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - col = int( - input( - f"Enter the column (1, 2 or 3) to play for the player {self.to_play()}: " - ) - ) - choice = (row - 1) * 3 + (col - 1) - if ( - choice in self.legal_actions() - and 1 <= row - and 1 <= col - and row <= 3 - and col <= 3 - ): - break - except: - pass - print("Wrong input, try again") - return choice - - def expert_agent(self): - """ - Hard coded agent that MuZero faces to assess his progress in multiplayer games. - It doesn't influence training - - Returns: - Action as an integer to take in the current game state - """ - return self.env.expert_action() - - def action_to_string(self, action_number): - """ - Convert an action number to a string representing the action. - - Args: - action_number: an integer from the action space. - - Returns: - String representing the action. - """ - row = action_number // 3 + 1 - col = action_number % 3 + 1 - return f"Play row {row}, column {col}" - - -class TicTacToe: - def __init__(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - - def to_play(self): - return 0 if self.player == 1 else 1 - - def reset(self): - self.board = numpy.zeros((3, 3), dtype="int32") - self.player = 1 - return self.get_observation() - - def step(self, action): - row = action // 3 - col = action % 3 - self.board[row, col] = self.player - - done = self.have_winner() or len(self.legal_actions()) == 0 - - reward = 1 if self.have_winner() else 0 - - self.player *= -1 - - return self.get_observation(), reward, done - - def get_observation(self): - board_player1 = numpy.where(self.board == 1, 1, 0) - board_player2 = numpy.where(self.board == -1, 1, 0) - board_to_play = numpy.full((3, 3), self.player) - return numpy.array([board_player1, board_player2, board_to_play], dtype="int32") - - def legal_actions(self): - legal = [] - for i in range(9): - row = i // 3 - col = i % 3 - if self.board[row, col] == 0: - legal.append(i) - return legal - - def have_winner(self): - # Horizontal and vertical checks - for i in range(3): - if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all(): - return True - - # Diagonal checks - if ( - self.board[0, 0] == self.player - and self.board[1, 1] == self.player - and self.board[2, 2] == self.player - ): - return True - if ( - self.board[2, 0] == self.player - and self.board[1, 1] == self.player - and self.board[0, 2] == self.player - ): - return True - - return False - - def expert_action(self): - board = self.board - action = numpy.random.choice(self.legal_actions()) - # Horizontal and vertical checks - for i in range(3): - if abs(sum(board[i, :])) == 2: - ind = numpy.where(board[i, :] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([i]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(board[i, :]) > 0: - return action - - if abs(sum(board[:, i])) == 2: - ind = numpy.where(board[:, i] == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([i])), (3, 3) - )[0] - if self.player * sum(board[:, i]) > 0: - return action - - # Diagonal checks - diag = board.diagonal() - anti_diag = numpy.fliplr(board).diagonal() - if abs(sum(diag)) == 2: - ind = numpy.where(diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([ind])), (3, 3) - )[0] - if self.player * sum(diag) > 0: - return action - - if 
abs(sum(anti_diag)) == 2: - ind = numpy.where(anti_diag == 0)[0][0] - action = numpy.ravel_multi_index( - (numpy.array([ind]), numpy.array([2 - ind])), (3, 3) - )[0] - if self.player * sum(anti_diag) > 0: - return action - - return action - - def render(self): - print(self.board[::-1]) diff --git a/muzero_2net.py b/muzero_2net.py index 642602da..fe9f6478 100644 --- a/muzero_2net.py +++ b/muzero_2net.py @@ -71,7 +71,6 @@ def __init__(self, game_name, config=None, split_resources_in=1): # 重命名路径,以便区分不同的模型 self.config.results_path /= "muzero_2net" - self.config.training_steps = 100000 # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) diff --git a/muzero_general.py b/muzero_general.py index 6d8363d9..b3fb9411 100644 --- a/muzero_general.py +++ b/muzero_general.py @@ -11,7 +11,7 @@ from simplifiedMuZero.without_rb.game_play import GamePlay from simplifiedMuZero.without_rb.play_buffer import PlayBuffer -from simplifiedMuZero.without_rb.trainer import Trainer +from simplifiedMuZero.without_rb.trainer_no_PV import Trainer from muzero import load_model_menu, hyperparameter_search import models @@ -61,6 +61,9 @@ def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save else: self.config = config + # using random search instand of MCTS + self.config.temperature_threshold = 0 + # Fix random generator seed numpy.random.seed(self.config.seed) torch.manual_seed(self.config.seed) diff --git a/simplifiedMuZero/without_rb/trainer_no_PV.py b/simplifiedMuZero/without_rb/trainer_no_PV.py new file mode 100644 index 00000000..265b13c5 --- /dev/null +++ b/simplifiedMuZero/without_rb/trainer_no_PV.py @@ -0,0 +1,243 @@ +import numpy +import torch +import models + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, model_cls, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. + """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, 
len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss From cbf40609a195ed58987640572f5d62c1c7e201f5 Mon Sep 17 00:00:00 2001 From: chunchangshao Date: Wed, 6 Sep 2023 12:40:50 +0100 Subject: [PATCH 9/9] synchronize modifications --- simplifiedMuZero/no_pv/trainer_no_pv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplifiedMuZero/no_pv/trainer_no_pv.py b/simplifiedMuZero/no_pv/trainer_no_pv.py index e4a6080c..f51e3ef8 100644 --- a/simplifiedMuZero/no_pv/trainer_no_pv.py +++ b/simplifiedMuZero/no_pv/trainer_no_pv.py @@ -251,7 +251,7 @@ def update_weights(self, batch): # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) # loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss - loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + loss = reward_loss + policy_loss if self.config.PER: # Correct PER bias by using importance-sampling (IS) weights loss *= weight_batch
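# A minimal sketch of the exponential learning-rate schedule implemented by
# Trainer.update_lr above, assuming the values shown in the TicTacToe configs in this
# patch (lr_init=0.003, lr_decay_rate=1, lr_decay_steps=10000). With lr_decay_rate=1 the
# schedule is constant; a rate below 1 decays the rate as training_step grows.
# scheduled_lr is an illustrative helper name, not part of the repository.
def scheduled_lr(training_step, lr_init=0.003, lr_decay_rate=1.0, lr_decay_steps=10000):
    return lr_init * lr_decay_rate ** (training_step / lr_decay_steps)

print(scheduled_lr(0), scheduled_lr(50_000), scheduled_lr(50_000, lr_decay_rate=0.9))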
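# A minimal sketch of the per-step losses computed by Trainer.loss_function above, and of
# how the hunk in this commit changes the total: the original trainer combines
# value_loss * value_loss_weight + reward_loss + policy_loss, while trainer_no_pv now
# keeps only reward_loss + policy_loss. The shapes below (batch of 4, support of
# 2 * support_size + 1 with support_size=10, 9 actions) are assumptions matching the
# TicTacToe configs, and the targets are placeholder distributions, not real batch data.
import torch

batch, support, actions = 4, 2 * 10 + 1, 9
value = torch.randn(batch, support)           # value logits over the support
reward = torch.randn(batch, support)          # reward logits over the support
policy_logits = torch.randn(batch, actions)   # policy logits over the action space
target_value = torch.softmax(torch.randn(batch, support), dim=1)
target_reward = torch.softmax(torch.randn(batch, support), dim=1)
target_policy = torch.softmax(torch.randn(batch, actions), dim=1)

log_softmax = torch.nn.LogSoftmax(dim=1)
value_loss = (-target_value * log_softmax(value)).sum(1)       # cross-entropy per sample
reward_loss = (-target_reward * log_softmax(reward)).sum(1)
policy_loss = (-target_policy * log_softmax(policy_logits)).sum(1)

full_loss = (0.25 * value_loss + reward_loss + policy_loss).mean()  # original weighting
no_value_loss = (reward_loss + policy_loss).mean()                  # weighting after this commit
print(full_loss.item(), no_value_loss.item())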