import pgx
env = pgx.make("animal_shogi")
+or you can directly load AnimalShogi
class
Animal Shogi (Dōbutsu shōgi) is a variant of shogi primarily developed for children. It consists of a 3x4 board and four types of pieces (five including promoted pieces). One of the rule differences from regular shogi is the Try Rule, where entering the opponent's territory with the king leads to victory.
+See also Wikipedia
Name | Value
---|---
Version | v0
Number of players | 2
Number of actions | 132
Observation shape | (4, 3, 194)
Observation type | float
Rewards | {-1, 0, 1}
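The specs above can also be checked programmatically. A minimal sketch (assuming Pgx is installed; num_players, num_actions, and observation_shape are properties of the Env API described later):

import pgx

env = pgx.make("animal_shogi")
print(env.num_players)        # 2
print(env.num_actions)        # 132
print(env.observation_shape)  # (4, 3, 194)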
Index | Description
---|---
[:, :, 0:5] | my pieces on board
[:, :, 5:10] | opponent's pieces on board
[:, :, 10:16] | my hands
[:, :, 16:22] | opponent's hands
[:, :, 22:24] | repetitions
... | ...
[:, :, 193] | player_id's turn
[:, :, 194] | Elapsed timesteps (normalized to 1)
Uses AlphaZero-like action labels:

- 132 labels
- 8 x 12 (direction) x (source square)
- 3 x 12 (drop piece type) x (destination square)

Non-zero rewards are given only at the terminal states.
The reward at terminal state is described in this table:
 | Reward
---|---
Win | +1
Lose | -1
Draw | 0
Termination happens when
+In cases 3 and 4, the game is declared a draw.
+v0
: Initial release (v1.0.0)

This is the list of all public APIs of Pgx.
+Two important components in Pgx are State
and Env
.
pgx.State
+
+
+
+ Bases: abc.ABC
Base state class of all Pgx game environments. Basically an immutable (frozen) dataclass.
+A basic usage is generating via Env.init
:
state = env.init(jax.random.PRNGKey(0))
+
and Env.step
receives and returns this state class:
state = env.step(state, action)
+
Serialization via flax.struct.serialization
is supported.
+There are 6 common attributes over all games:
Attributes:
+Name | +Type | +Description | +
---|---|---|
current_player |
+
+ jnp.ndarray
+ |
+ id of agent to play. +Note that this does NOT represent the turn (e.g., black/white in Go). +This ID is consistent over the parallel vmapped states. |
+
observation |
+
+ jnp.ndarray
+ |
+ observation for the current state. Env.observe is called to compute.
+ |
+
rewards |
+
+ jnp.ndarray
+ |
+ the i-th element indicates the intermediate reward for the agent with player-id i. If Env.step is called for a terminal state, the following state.rewards is zero for all players. |
+
terminated |
+
+ jnp.ndarray
+ |
+ denotes that the state is terminal state. Note that
+some environments (e.g., Go) have a max_termination_steps parameter inside and will terminate within a limited number of states (following AlphaGo). |
+
truncated |
+
+ jnp.ndarray
+ |
+ indicates that the episode ends with the reason other than termination.
+Note that current Pgx environments do not invoke truncation but users can use the TimeLimit wrapper to truncate the environment. In Pgx environments, some MinAtar games may not terminate within a finite timestep. However, the other environments are supposed to terminate within a finite timestep with probability one. |
+
legal_action_mask |
+
+ jnp.ndarray
+ |
+ Boolean array of legal actions. If an illegal action is taken, the game will terminate immediately with the penalty to the player. |
+
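For illustration, these common attributes can be inspected right after initialization. A minimal sketch using the tic_tac_toe environment:

import jax
import pgx

env = pgx.make("tic_tac_toe")
state = env.init(jax.random.PRNGKey(0))
print(state.current_player)     # id of the agent to play
print(state.observation.shape)
print(state.rewards)            # zero for all players at the initial state
print(state.terminated, state.truncated)
print(state.legal_action_mask)  # boolean array over the 9 actions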
pgx/v1.py
env_id: EnvId
+
+
+ property
+ abstractmethod
+
+
+Environment id (e.g. "go_19x19")
+save_svg(filename, *, color_theme=None, scale=None)
+
+Save the entire state (not observation) to a file.
+The filename must end with .svg
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
color_theme |
+
+ Optional[Literal['light', 'dark']]
+ |
+ xxx see also global config. |
+
+ None
+ |
+
scale |
+
+ Optional[float]
+ |
+ change image size. Default(None) is 1.0 |
+
+ None
+ |
+
Returns:
+Type | +Description | +
---|---|
+ None
+ |
+ None |
+
pgx/v1.py
to_svg(*, color_theme=None, scale=None)
+
+Return SVG string. Useful for visualization in notebook.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
color_theme |
+
+ Optional[Literal['light', 'dark']]
+ |
+ xxx see also global config. |
+
+ None
+ |
+
scale |
+
+ Optional[float]
+ |
+ change image size. Default(None) is 1.0 |
+
+ None
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
str |
+ str
+ |
+ SVG string |
+
pgx/v1.py
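For example, assuming state is a Pgx State as above, the two visualization methods can be used like this (a sketch; the file name is arbitrary):

svg_string = state.to_svg(color_theme="dark", scale=2.0)  # e.g., for display in a notebook
state.save_svg("state.svg")                               # the filename must end with .svg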
pgx.Env
+
+
+
+ Bases: abc.ABC
Environment class API.
+Example usage
env: Env = pgx.make("tic_tac_toe")
state = env.init(jax.random.PRNGKey(0))
action = jax.random.int32(4)
state = env.step(state, action)

pgx/v1.py
id: EnvId
+
+
+ property
+ abstractmethod
+
+
+Environment id.
+num_actions: int
+
+
+ property
+
+
+Return the size of action space (e.g., 9 in Tic-tac-toe)
+num_players: int
+
+
+ property
+ abstractmethod
+
+
+Number of players (e.g., 2 in Tic-tac-toe)
+observation_shape: Tuple[int, ...]
+
+
+ property
+
+
+Return the matrix shape of observation
+version: str
+
+
+ property
+ abstractmethod
+
+
+Environment version. Updated when behavior, parameter, or API is changed. +Refactoring or speeding up without any expected behavior changes will NOT update the version number.
+init(key)
+
+Return the initial state. Note that no internal state of +environment changes.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
key |
+
+ jax.random.KeyArray
+ |
+ pseudo-random generator key in JAX |
+ + required + | +
Returns:
+Name | Type | +Description | +
---|---|---|
State |
+ State
+ |
+ initial state of environment |
+
pgx/v1.py
observe(state, player_id)
+
+step(state, action)
+
+Step function.
+ +pgx/v1.py
pgx.EnvId = Literal['2048', 'animal_shogi', 'backgammon', 'bridge_bidding', 'chess', 'connect_four', 'gardner_chess', 'go_9x9', 'go_19x19', 'hex', 'kuhn_poker', 'leduc_holdem', 'minatar-asterix', 'minatar-breakout', 'minatar-freeway', 'minatar-seaquest', 'minatar-space_invaders', 'othello', 'shogi', 'sparrow_mahjong', 'tic_tac_toe']
+
+
+ module-attribute
+
+
+Naming convention of EnvId
Hyphen -
is used to represent that there is a different original game source (e.g., MinAtar
), and underscore _
is used for the other cases.
pgx.make(env_id)
+
+Load the specified environment.
+ +BridgeBidding
environment
BridgeBidding
environment requires the domain knowledge of bridge game.
+So we forbid users to load the bridge environment by make("bridge_bidding")
.
+Use BridgeBidding
class directly by from pgx.bridge_bidding import BridgeBidding
.
pgx/v1.py
pgx.available_envs()
+
+List all environment ids available in pgx.make
function.
Example usage
+pgx.available_envs()
+('2048', 'animal_shogi', 'backgammon', 'chess', 'connect_four', 'go_9x9', 'go_19x19', 'hex', 'kuhn_poker', 'leduc_holdem', 'minatar-asterix', 'minatar-breakout', 'minatar-freeway', 'minatar-seaquest', 'minatar-space_invaders', 'othello', 'shogi', 'sparrow_mahjong', 'tic_tac_toe')
+
BridgeBidding
environment
BridgeBidding
environment requires the domain knowledge of bridge game.
+So we forbid users to load the bridge environment by make("bridge_bidding")
.
+Use BridgeBidding
class directly by from pgx.bridge_bidding import BridgeBidding
.
pgx/v1.py
pgx.set_visualization_config(*, color_theme='light', scale=1.0, frame_duration_seconds=0.2)
+
+pgx/_src/visualizer.py
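A minimal usage sketch, using the keyword arguments shown in the signature above:

import pgx

pgx.set_visualization_config(color_theme="dark", scale=2.0, frame_duration_seconds=0.5)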
pgx.save_svg(state, filename, *, color_theme=None, scale=None)
+
+pgx/_src/visualizer.py
pgx.save_svg_animation(states, filename, *, color_theme=None, scale=None, frame_duration_seconds=None)
+
+pgx/_src/visualizer.py
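For illustration, one can collect the states of an episode and write them out with these helpers. A sketch assuming a non-MinAtar environment such as tic_tac_toe (SVG animation is not supported for MinAtar):

import jax
import jax.numpy as jnp
import pgx

env = pgx.make("tic_tac_toe")
state = env.init(jax.random.PRNGKey(0))
states = [state]
while not (state.terminated | state.truncated):
    action = jnp.argmax(state.legal_action_mask)  # pick the first legal action
    state = env.step(state, action)
    states.append(state)

pgx.save_svg(state, "final_state.svg")
pgx.save_svg_animation(states, "episode.svg", frame_duration_seconds=0.5)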
pgx.BaselineModelId = Literal['animal_shogi_v0', 'gardner_chess_v0', 'go_9x9_v0', 'hex_v0', 'othello_v0']
+
+
+ module-attribute
+
+
+pgx.make_baseline_model(model_id, download_dir='baselines')
+
+pgx/_src/baseline.py
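A usage sketch (it additionally requires the Haiku library, as noted in the example below; state is assumed to be a batch of go_9x9 states):

model = pgx.make_baseline_model("go_9x9_v0")
logits, value = model(state.observation)  # batched observation in, policy logits and value out
action = logits.argmax(axis=-1)           # greedy action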
import jax
+import jax.numpy as jnp
+import pgx
+
+seed = 42
+batch_size = 10
+key = jax.random.PRNGKey(seed)
+
+
+def act_randomly(rng_key, obs, mask):
+ """Ignore observation and choose randomly from legal actions"""
+ del obs
+ probs = mask / mask.sum()
+ logits = jnp.maximum(jnp.log(probs), jnp.finfo(probs.dtype).min)
+ return jax.random.categorical(rng_key, logits=logits, axis=-1)
+
+
+# Load the environment
+env = pgx.make("go_9x9")
+init_fn = jax.jit(jax.vmap(env.init))
+step_fn = jax.jit(jax.vmap(env.step))
+
+# Initialize the states
+key, subkey = jax.random.split(key)
+keys = jax.random.split(subkey, batch_size)
+state = init_fn(keys)
+
+# Run random simulation
+while not (state.terminated | state.truncated).all():
+ key, subkey = jax.random.split(key)
+ action = act_randomly(subkey, state.observation, state.legal_action_mask)
+ state = step_fn(state, action) # state.reward (2,)
+
This illustrative example helps to understand
+state.current_player
is definedEnv.step
behaves against already terminated statesimport jax
+import jax.numpy as jnp
+import pgx
+from pgx.experimental.utils import act_randomly
+
+seed = 42
+batch_size = 10
+key = jax.random.PRNGKey(seed)
+
+# Prepare agent A and B
+# Agent A: random player
+# Agent B: baseline player provided by Pgx
+A = 0
+B = 1
+
+# Load the environment
+env = pgx.make("go_9x9")
+init_fn = jax.jit(jax.vmap(env.init))
+step_fn = jax.jit(jax.vmap(env.step))
+
+# Prepare baseline model
+# Note that it additionally requires the Haiku library ($ pip install dm-haiku)
+model_id = "go_9x9_v0"
+model = pgx.make_baseline_model(model_id)
+
+# Initialize the states
+key, subkey = jax.random.split(key)
+keys = jax.random.split(subkey, batch_size)
+state = init_fn(keys)
+print(f"Game index: {jnp.arange(batch_size)}") # [0 1 2 3 4 5 6 7 8 9]
+print(f"Black player: {state.current_player}") # [1 1 0 1 0 0 1 1 1 1]
+# In other words
+print(f"A is black: {state.current_player == A}") # [False False True False True True False False False False]
+print(f"B is black: {state.current_player == B}") # [ True True False True False False True True True True]
+
+# Run simulation
+R = state.rewards
+while not (state.terminated | state.truncated).all():
+ # Action of random player A
+ key, subkey = jax.random.split(key)
+ action_A = jax.jit(act_randomly)(subkey, state)
+ # Greedy action of baseline model B
+ logits, value = model(state.observation)
+ action_B = logits.argmax(axis=-1)
+
+ action = jnp.where(state.current_player == A, action_A, action_B)
+ state = step_fn(state, action)
+ R += state.rewards
+
+print(f"Return of agent A = {R[:, A]}") # [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
+print(f"Return of agent B = {R[:, B]}") # [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
+
Note that we can avoid explicitly dealing with the first batch dimension like [:, A]
by using vmap
later.
+ +
++ +
+or you can directly load Backgammon
class
++Backgammon ...
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +162 (= 6 * 26 + 6) |
+
Observation shape | +(34,) |
+
Observation type | +int |
+
Rewards | +{-3, -2, -1, 0, 1, 2, 3} |
+
The first 28
observation dimensions follow [Antonoglou+22]
:
++An action in our implementation consists of 4 micro-actions, the same as the maximum number +of dice a player can play at each turn. Each micro-action encodes the source position of a chip +along with the value of the die used. We consider 26 possible source positions, with the 0-th position corresponding to a no-op, the 1st to retrieving a chip from the hit pile, and the remaining to selecting a chip in one of the 24 possible points. Each micro-action is encoded as a single integer with micro-action =
+src · 6 + die
.
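For illustration, the micro-action encoding quoted above can be written as a small sketch (src 0 is the no-op, 1 the hit pile, and 2-25 the 24 board points; treating die as a 0-5 index is an assumption here):

def encode_micro_action(src: int, die: int) -> int:
    # micro-action = src * 6 + die
    return src * 6 + die

def decode_micro_action(action: int) -> tuple[int, int]:
    return action // 6, action % 6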
Index | +Description | +
---|---|
[:24] |
+represents | +
[24:28] |
+represents | +
[28:34] |
+is one-hot vector of playable dice | +
...
+...
+...
+v0
: Initial release (v1.0.0)[Antonoglou+22]
"Planning in Stochastic Environments with a Learned Modell", ICLR+ +
++ +
+TBA
+ + + + + + ++ +
++ +
+or you can directly load Chess
class
TBA
+TBA
+Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +4672 |
+
Observation shape | +(8, 8, 119) |
+
Observation type | +float |
+
Rewards | +{-1, 0, 1} |
+
We follow the observation design of AlphaZero [Silver+18]
.
Index | +Description | +
---|---|
TBA | +TBA | +
TBA
+Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Draw | +0 |
+
Termination occurs when one of the following conditions is satisfied:
+50
halfmoves are elapsed without any captures or pawn moves512
steps are elapsed (from AlphaZero [Silver+18]
)v1
: Bug fix when castling by @HongruiTang in #983 (v1.1.0) v0
: Initial release (v1.0.0)[Silver+18]
"A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play" Science+ +
++ +
+or you can directly load ConnectFour
class
++Connect Four is a two-player connection rack game, in which the players choose a color and then take turns dropping colored tokens into a seven-column, six-row vertically suspended grid. The pieces fall straight down, occupying the lowest available space within the column. The objective of the game is to be the first to form a horizontal, vertical, or diagonal line of four of one's own tokens.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +7 |
+
Observation shape | +(6, 7, 2) |
+
Observation type | +bool |
+
Rewards | +{-1, 0, 1} |
+
Index | +Description | +
---|---|
[:, :, 0] |
+represents (6, 7) squares filled by the current player |
+
[:, :, 1] |
+represents (6, 7) squares filled by the opponent player of current player |
+
Each action represents the column index the player drops the token to.
+Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Draw | +0 |
+
Termination happens when
+42 (= 6 x 7)
squares are filled.v0
: Initial release (v1.0.0)+ +
++ +
+or you can directly load GardnerChess
class
TBA
+TBA
+Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +1225 |
+
Observation shape | +(5, 5, 115) |
+
Observation type | +float |
+
Rewards | +{-1, 0, 1} |
+
We follow the observation design of AlphaZero [Silver+18]
.
Index | +Description | +
---|---|
TBA | +TBA | +
TBA
+Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Draw | +0 |
+
Termination occurs when one of the following conditions is satisfied:
+50
halfmoves are elapsed without any captures or pawn moves256
steps are elapsed (512
in full-size chess experiments in AlphaZero [Silver+18]
)v0
: Initial release (v1.0.0)[Silver+18]
"A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play" Science + +
++ +
+or you can directly load Go
class
++Go is an abstract strategy board game for two players in which the aim is to surround more territory than the opponent. The game was invented in China more than 2,500 years ago and is believed to be the oldest board game continuously played to the present day.
+ +
The rule implemented in Pgx follows Tromp-Taylor Rules.
+Komi
+By default, we use 6.5
. Users can set different komi
at Go
class constructor.
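For example, a different komi can be passed to the constructor directly (a sketch; the size and komi keyword arguments follow the Go constructor used internally by pgx.make):

from pgx.go import Go

env = Go(size=9, komi=6.5)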
Ko
+On PSK implementations.
The Tromp-Taylor rules employ PSK. However, implementing strict PSK is inefficient because
+As PSK rarely happens, to the best of our knowledge it is usual to compromise in PSK implementations. For example,
+Note that the strict rule is "PSK for legal actions, and a PSK action leads to an immediate loss." So we also compromise at this point; our approach is
+Anyway, we believe its effect is very small as PSK rarely happens, especially on a 19x19 board.
+Let N
be the board size (e.g., 19
).
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +N x N + 1 |
+
Observation shape | +(N, N, 17) |
+
Observation type | +bool |
+
Rewards | +{-1, 1} |
+
We follow the observation design of AlphaGo Zero [Silver+17]
.
Index | +Description | +
---|---|
obs[:, :, 0] |
+stones of player_id (@ current board) |
+
obs[:, :, 1] |
+stones of player_id 's opponent (@ current board) |
+
obs[:, :, 2] |
+stones of player_id (@ 1-step before) |
+
obs[:, :, 3] |
+stones of player_id 's opponent (@ 1-step before) |
+
... | +... | +
obs[:, :, -1] |
+color of player_id |
+
Final observation dimension
+For the final dimension, there are two possible options:
+player_id
This ambiguity happens because observe
function is available even if player_id
is different from state.current_player
.
+In AlphaGo Zero paper [Silver+17]
, the final dimension C is explained as:
++The final feature plane, C, represents the colour to play, and has a constant value of either 1 if black + is to play or 0 if white is to play.
+
however, it also describes as
+++the colour feature C is necessary because the komi is not observable.
+
So, we use player_id's color to let the agent know komi information.
+As long as it's called when player_id == state.current_player
, this doesn't matter.
Each action ({0, ..., N * N - 1}
) represents the point to be colored.
+The final action represents pass action.
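As an illustration (assuming the usual row-major ordering of points, which is an assumption here):

N = 9                     # board size
row, col = 2, 3           # an arbitrary point
action = row * N + col    # point (row, col) -> action index
pass_action = N * N       # the final action is pass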
Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Termination happens when
+N * N * 2
steps are elapsed [Silver+17]
.v0
: Initial release (v1.0.0)[Silver+17]
"Mastering the game of go without human knowledge" Nature+ +
++ +
+or you can directly load Hex
class
++Hex is a two player abstract strategy board game in which players attempt to connect opposite sides of a rhombus-shaped board made of hexagonal cells. Hex was invented by mathematician and poet Piet Hein in 1942 and later rediscovered and popularized by John Nash.
+ +
As the first player to move has a distinct advantage, the swap rule is used to compensate for this. +The detailed swap rule used in Pgx follows swap pieces:
+++"Swap pieces": The players perform the swap by switching pieces. This means the initial red piece is replaced by a blue piece in the mirror image position, where the mirroring takes place with respect to the board's long diagonal. For example, a red piece at a3 becomes a blue piece at c1. The players do not switch colours: Red stays Red and Blue stays Blue. After the swap, it is Red's turn.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +122 (= 11 x 11 + 1) |
+
Observation shape | +(11, 11, 3) |
+
Observation type | +bool |
+
Rewards | +{-1, 1} |
+
Index | +Description | +
---|---|
[:, :, 0] |
+represents (11, 11) cells filled by player_ix |
+
[:, :, 1] |
+represents (11, 11) cells filled by the opponent player of player_id |
+
[:, :, 2] |
+represents whether player_id is black or white |
+
Each action ({0, ... 120}
) represents the cell index to be filled.
+The final action 121
is the swap action available only at the second turn.
Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Note that there is no draw in Hex.
Termination happens when either player connects opposite sides of the board.
+v0
: Initial release (v1.0.0)+ +
+ ++ +
+ +import jax
+import pgx
+
+env = pgx.make("go_19x19")
+init = jax.jit(jax.vmap(env.init))
+step = jax.jit(jax.vmap(env.step))
+
+batch_size = 1024
+keys = jax.random.split(jax.random.PRNGKey(42), batch_size)
+state = init(keys) # vectorized states
+while not (state.terminated | state.truncated).all():
+ action = model(state.current_player, state.observation, state.legal_action_mask)
+ state = step(state, action) # state.reward (2,)
+
+ +
++ +
+Kuhn poker is a simplified poker with three cards: J, Q, and K.
+Each player is dealt one card and the remaining card is unused. There are four actions: check, call, bet, and fold, and five possible scenarios.
+bet (1st) - call (2nd)
: Showdown and the winner takes +2
bet (1st) - fold (2nd)
: 1st player takes +1
check (1st) - check (2nd)
: Showdown and the winner takes +1
check (1st) - bet (2nd) - call (1st)
: Showdown and the winner takes +2
check (1st) - bet (2nd) - fold (1st)
: 2nd takes +1
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +4 |
+
Observation shape | +(7,) |
+
Observation type | +bool |
+
Rewards | +{-2, -1, +1, +2} |
+
Index | +Description | +
---|---|
[0] |
+One if J in my hand | +
[1] |
+One if Q in my hand | +
[2] |
+One if K in my hand | +
[3] |
+One if 0 chip is bet by me | +
[4] |
+One if 1 chip is bet by me | +
[5] |
+One if 0 chips are bet by the opponent |
[6] |
+One if 1 chip is bet by the opponent |
There are four distinct actions.
+Action | +Index | +
---|---|
Call | +0 | +
Bet | +1 | +
Fold | +2 | +
Check | +3 | +
The winner takes +2
or +1
depending on the game payoff.
+As Kuhn poker is a zero-sum game, the loser takes -2
or -1
respectively.
Follows the rules above.
+v0
: Initial release (v1.0.0)+ +
++ +
Leduc hold’em is a simplified poker proposed in [Southey+05].
+We quote the description in [Southey+05]:
+++Leduc Hold ’Em. We have also constructed a smaller +version of hold ’em, which seeks to retain the strategic elements of the large game while keeping the size of the game +tractable. In Leduc hold ’em, the deck consists of two suits +with three cards in each suit. There are two rounds. In the +first round a single private card is dealt to each player. In +the second round a single board card is revealed. There is +a two-bet maximum, with raise amounts of 2 and 4 in the +first and second round, respectively. Both players start the +first round with 1 already in the pot.
+ +Figure 1: An example decision tree for a single betting +round in poker with a two-bet maximum. Leaf nodes with +open boxes continue to the next round, while closed boxes +end the hand.
+
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +4 |
+
Observation shape | +(7,) |
+
Observation type | +bool |
+
Rewards | +{-13, -12, ... 0, ..., 12, 13} |
+
Index | +Description | +
---|---|
[0] |
+True if J in hand | +
[1] |
+True if Q in hand | +
[2] |
+True if K in hand | +
[3] |
+True if J is the public card | +
[4] |
+True if Q is the public card |
[5] |
+True if K is the public card |
[6:19] |
+represent my chip count (0, ..., 13) | +
[20:33] |
+represent opponent's chip count (0, ..., 13) | +
There are four distinct actions.
+Index | +Action | +
---|---|
0 | +Call | +
1 | +Raise | +
2 | +Fold | +
3 | +Check | +
The reward is the payoff of the game.
+Follows the rules above.
+v0
: Initial release (v1.0.0)+ +
+ +Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionally to use the MinAtar suite in Pgx:
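$ pip install pgx-minatar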
Then, you can use the environment as follows:
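import pgx

env = pgx.make("minatar-asterix")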
+ +MinAtar is originally proposed by [Young&Tian+19]
.
+The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Asterix environment is described as follows:
++The player can move freely along the 4 cardinal directions. Enemies and treasure spawn from the sides. A reward of ++1 is given for picking up treasure. Termination occurs if the player makes contact with an enemy. Enemy and +treasure direction are indicated by a trail channel. Difficulty is periodically increased by increasing the speed +and spawn rate of enemies and treasure.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +1 |
+
Number of actions | +5 |
+
Observation shape | +(10, 10, 4) |
+
Observation type | +bool |
+
Rewards | +{0, 1} |
+
Index | +Channel | +
---|---|
[:, :, 0] |
+Player | +
[:, :, 1] |
+Enemy | +
[:, :, 2] |
+Trail | +
[:, :, 3] |
+Gold | +
TBA
+v0
: Initial release (v1.0.0)[Young&Tian+19]
"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
+ + + + + + ++ +
+ +Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionally to use the MinAtar suite in Pgx:
Then, you can use the environment as follows:
+ +MinAtar is originally proposed by [Young&Tian+19]
.
+The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Breakout environment is described as follows:
++The player controls a paddle on the bottom of the screen and must bounce a ball to break 3 rows of bricks along the top of the screen. A reward of +1 is given for each brick broken by the ball. When all bricks are cleared another 3 rows are added. The ball travels only along diagonals; when it hits the paddle it is bounced either to the left or right depending on the side of the paddle hit, and when it hits a wall or brick it is reflected. Termination occurs when the ball hits the bottom of the screen. The ball's direction is indicated by a trail channel.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +1 |
+
Number of actions | +3 |
+
Observation shape | +(10, 10, 4) |
+
Observation type | +bool |
+
Rewards | +{0, 1} |
+
Index | +Channel | +
---|---|
[:, :, 0] |
+Paddle | +
[:, :, 1] |
+Ball | +
[:, :, 2] |
+Trail | +
[:, :, 3] |
+Brick | +
TBA
+v0
: Initial release (v1.0.0)[Young&Tian+19]
"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
+ + + + + + ++ +
+ +Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionally to use the MinAtar suite in Pgx:
Then, you can use the environment as follows:
+ +MinAtar is originally proposed by [Young&Tian+19]
.
+The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Freeway environment is described as follows:
++The player begins at the bottom of the screen and motion is restricted to traveling up and down. Player speed is +also restricted such that the player can only move every 3 frames. A reward of +1 is given when the player reaches +the top of the screen, at which point the player is returned to the bottom. Cars travel horizontally on the screen +and teleport to the other side when the edge is reached. When hit by a car, the player is returned to the bottom of +the screen. Car direction and speed is indicated by 5 trail channels, the location of the trail gives direction +while the specific channel indicates how frequently the car moves (from once every frame to once every 5 frames). +Each time the player successfully reaches the top of the screen, the car speeds are randomized. Termination occurs +after 2500 frames have elapsed.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +1 |
+
Number of actions | +3 |
+
Observation shape | +(10, 10, 7) |
+
Observation type | +bool |
+
Rewards | +{0, 1} |
+
Index | +Channel | +
---|---|
[:, :, 0] |
+Chicken | +
[:, :, 1] |
+Car | +
[:, :, 2] |
+Speed 1 | +
[:, :, 3] |
+Speed 2 | +
[:, :, 4] |
+Speed 3 | +
[:, :, 5] |
+Speed 4 | +
TBA
+v0
: Initial release (v1.0.0)[Young&Tian+19]
"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
+ + + + + + ++ +
+ +Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionally to use the MinAtar suite in Pgx:
Then, you can use the environment as follows:
+ +MinAtar is originally proposed by [Young&Tian+19]
.
+The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Seaquest environment is described as follows:
++The player controls a submarine consisting of two cells, front and back, to allow direction to be determined. The +player can also fire bullets from the front of the submarine. Enemies consist of submarines and fish, distinguished +by the fact that submarines shoot bullets and fish do not. A reward of +1 is given each time an enemy is struck by +one of the player's bullets, at which point the enemy is also removed. There are also divers which the player can +move onto to pick up, doing so increments a bar indicated by another channel along the bottom of the screen. The +player also has a limited supply of oxygen indicated by another bar in another channel. Oxygen degrades over time, +and is replenished whenever the player moves to the top of the screen as long as the player has at least one rescued +diver on board. The player can carry a maximum of 6 divers. When surfacing with less than 6, one diver is removed. +When surfacing with 6, all divers are removed and a reward is given for each active cell in the oxygen bar. Each +time the player surfaces the difficulty is increased by increasing the spawn rate and movement speed of enemies. +Termination occurs when the player is hit by an enemy fish, sub or bullet; or when oxygen reached 0; or when the +player attempts to surface with no rescued divers. Enemy and diver directions are indicated by a trail channel +active in their previous location to reduce partial observability.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +1 |
+
Number of actions | +6 |
+
Observation shape | +(10, 10, 10) |
+
Observation type | +bool |
+
Rewards | +{0, 1, ..., 10} |
+
Index | +Channel | +
---|---|
[:, :, 0] |
+Player submarine (front) | +
[:, :, 1] |
+Player submarine (back) | +
[:, :, 2] |
+Friendly bullet | +
[:, :, 3] |
+Trail | +
[:, :, 4] |
+Enemy bullet | +
[:, :, 5] |
+Enemy fish | +
[:, :, 6] |
+Enemy submarine | +
[:, :, 7] |
+Oxygen gauge |
[:, :, 8] |
+Diver gauge |
[:, :, 9] |
+Diver | +
TBA
+v0
: Initial release (v1.0.0)[Young&Tian+19]
"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
+ + + + + + ++ +
+ +Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionally to use the MinAtar suite in Pgx:
Then, you can use the environment as follows:
+ +MinAtar is originally proposed by [Young&Tian+19]
.
+The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Space Invaders environment is described as follows:
++The player controls a cannon at the bottom of the screen and can shoot bullets upward at a cluster of aliens above. +The aliens move across the screen until one of them hits the edge, at which point they all move down and switch +directions. The current alien direction is indicated by 2 channels (one for left and one for right) one of which is +active at the location of each alien. A reward of +1 is given each time an alien is shot, and that alien is also +removed. The aliens will also shoot bullets back at the player. When few aliens are left, alien speed will begin to +increase. When only one alien is left, it will move at one cell per frame. When a wave of aliens is fully cleared a +new one will spawn which moves at a slightly faster speed than the last. Termination occurs when an alien or bullet +hits the player.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +1 |
+
Number of actions | +4 |
+
Observation shape | +(10, 10, 6) |
+
Observation type | +bool |
+
Rewards | +{0, 1} |
+
Index | +Channel | +
---|---|
[:, :, 0] |
+Cannon | +
[:, :, 1] |
+Alien | +
[:, :, 2] |
+Alien left | +
[:, :, 3] |
+Alien right | +
[:, :, 4] |
+Friendly bullet | +
[:, :, 5] |
+Enemy bullet | +
TBA
+v0
: Initial release (v1.0.0)[Young&Tian+19]
"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
+ + + + + + ++ +
++ +
+or you can directly load Othello
class
++Othello, or Reversi (which differs in not having a defined starting position), is a two-player zero-sum and perfect information abstract strategy board game, usually played on a board with 8 rows and 8 columns and a set of light and dark turnable pieces for each side. The player's goal is to have a majority of their colored pieces showing at the end of the game, turning over as many of their opponent's pieces as possible. The dark player makes the first move from the starting position, alternating with the light player. Each player has to place a piece on the board such that there exists at least one straight (horizontal, vertical, or diagonal) occupied line of opponent pieces between the new piece and another own piece. After placing the piece, the side turns over (flips, captures) all opponent pieces lying on any straight lines between the new piece and any anchoring own pieces.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +65 (= 8 x 8 + 1) |
+
Observation shape | +(8, 8, 2) |
+
Observation type | +bool |
+
Rewards | +{-1, 0, 1} |
+
Index | +Description | +
---|---|
[:, :, 0] |
+represents (8, 8) squares colored by the current player |
+
[:, :, 1] |
+represents (8, 8) squares colored by the opponent player of current player |
+
Each action ({0, ..., 63}
) represents the square index to be filled. The last 64
-th action represents pass action.
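For illustration (assuming row-major ordering of the 8 x 8 squares, which is an assumption here):

row, col = 2, 3
action = row * 8 + col   # square (row, col) -> action index
pass_action = 64         # the final action is pass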
Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Draw | +0 |
+
Termination happens when all 64 (= 8 x 8)
playable squares are filled.
v0
: Initial release (v1.0.0)+ +
++ +
+or you can directly load Play2048
class
++2048 ...
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +1 |
+
Number of actions | +4 |
+
Observation shape | +(4, 4, 31) |
+
Observation type | +bool |
+
Rewards | +{0, 2, 4, ...} |
+
Our observation design basically follows [Antonoglou+22]
:
++In our 2048 experiments we used a binary representation of the observation as an input to our model. +Specifically, the 4 × 4 board was flattened into a single vector of size 16, and a binary representation +of 31 bits for each number was obtained, for a total size of 496 numbers.
+
However, instead of a 496
-d flat vector, we employ (4, 4, 31)
vector.
Index | +Description | +
---|---|
[i, j, b] |
+represents that square (i, j) has a tile of 2 ^ b if b > 0 |
+
Each action corresponds to 0 (left)
, 1 (up)
, 2 (right)
, 3 (down)
.
Sum of merged tiles.
+If all squares are filled with tiles and no legal actions are available, the game terminates.
+v0
: Initial release (v1.0.0)[Antonoglou+22]
"Planning in Stochastic Environments with a Learned Modell", ICLRimport jax\nimport pgx\nenv = pgx.make(\"go_19x19\")\ninit = jax.jit(jax.vmap(env.init))\nstep = jax.jit(jax.vmap(env.step))\nbatch_size = 1024\nkeys = jax.random.split(jax.random.PRNGKey(42), batch_size)\nstate = init(keys) # vectorized states\nwhile not (state.terminated | state.truncated).all():\naction = model(state.current_player, state.observation, state.legal_action_mask)\nstate = step(state, action) # state.reward (2,)\n
"},{"location":"animal_shogi/","title":"AnimalShogi","text":"darklight
"},{"location":"animal_shogi/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"animal_shogi\")\n
or you can directly load AnimalShogi
class
from pgx.animal_shogi import AnimalShogi\nenv = AnimalShogi()\n
"},{"location":"animal_shogi/#description","title":"Description","text":"Animal Shogi (D\u014dbutsu sh\u014dgi) is a variant of shogi primarily developed for children. It consists of a 3x4 board and four types of pieces (five including promoted pieces). One of the rule differences from regular shogi is the Try Rule, where entering the opponent's territory with the king leads to victory.
See also Wikipedia
"},{"location":"animal_shogi/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 132
Observation shape (4, 3, 194)
Observation type float
Rewards {-1, 0, 1}
"},{"location":"animal_shogi/#observation","title":"Observation","text":"Index Description [:, :, 0:5]
my pieces on board [:, :, 5:10]
opponent's pieces on board [:, :, 10:16]
my hands [:, :, 16:22]
opponent's hands [:, :, 22:24]
repetitions ... ... [:, :, 193]
player_id
's turn' [:, :, 194]
Elapsed timesteps (normalized to 1
)"},{"location":"animal_shogi/#action","title":"Action","text":"Uses AlphaZero like action label:
132
labels8 x 12
(direction) x (source square)3 x 12
(drop piece type) x (destination square)Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Draw 0
"},{"location":"animal_shogi/#termination","title":"Termination","text":"Termination happens when
In cases 3 and 4, the game is declared a draw.
"},{"location":"animal_shogi/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)This is the list of all public APIs of Pgx. Two important components in Pgx are State
and Env
.
Naming convention of EnvId
Hyphen -
is used to represent that there is a different original game source (e.g., MinAtar
), and underscore -
is used for the other cases.
pgx.State
","text":" Bases: abc.ABC
Base state class of all Pgx game environments. Basically an immutable (frozen) dataclass. A basic usage is generating via Env.init
:
state = env.init(jax.random.PRNGKey(0))\n
and Env.step
receives and returns this state class:
state = env.step(state, action)\n
Serialization via flax.struct.serialization
is supported. There are 6 common attributes over all games:
Attributes:
Name Type Descriptioncurrent_player
jnp.ndarray
id of agent to play. Note that this does NOT represent the turn (e.g., black/white in Go). This ID is consistent over the parallel vmapped states.
observation
jnp.ndarray
observation for the current state. Env.observe
is called to compute.
rewards
jnp.ndarray
the i
-th element indicates the intermediate reward for the agent with player-id i
. If Env.step
is called for a terminal state, the following state.rewards
is zero for all players.
terminated
jnp.ndarray
denotes that the state is terminal state. Note that some environments (e.g., Go) have an max_termination_steps
parameter inside and will terminate within a limited number of states (following AlphaGo).
truncated
jnp.ndarray
indicates that the episode ends with the reason other than termination. Note that current Pgx environments do not invoke truncation but users can use TimeLimit
wrapper to truncate the environment. In Pgx environments, some MinAtar games may not terminate within a finite timestep. However, the other environments are supposed to terminate within a finite timestep with probability one.
legal_action_mask
jnp.ndarray
Boolean array of legal actions. If illegal action is taken, the game will terminate immediately with the penalty to the palyer.
Source code inpgx/v1.py
@dataclass\nclass State(abc.ABC):\n\"\"\"Base state class of all Pgx game environments. Basically an immutable (frozen) dataclass.\n A basic usage is generating via `Env.init`:\n state = env.init(jax.random.PRNGKey(0))\n and `Env.step` receives and returns this state class:\n state = env.step(state, action)\n Serialization via `flax.struct.serialization` is supported.\n There are 6 common attributes over all games:\n Attributes:\n current_player (jnp.ndarray): id of agent to play.\n Note that this does NOT represent the turn (e.g., black/white in Go).\n This ID is consistent over the parallel vmapped states.\n observation (jnp.ndarray): observation for the current state.\n `Env.observe` is called to compute.\n rewards (jnp.ndarray): the `i`-th element indicates the intermediate reward for\n the agent with player-id `i`. If `Env.step` is called for a terminal state,\n the following `state.rewards` is zero for all players.\n terminated (jnp.ndarray): denotes that the state is terminal state. Note that\n some environments (e.g., Go) have an `max_termination_steps` parameter inside\n and will terminate within a limited number of states (following AlphaGo).\n truncated (jnp.ndarray): indicates that the episode ends with the reason other than termination.\n Note that current Pgx environments do not invoke truncation but users can use `TimeLimit` wrapper\n to truncate the environment. In Pgx environments, some MinAtar games may not terminate within a finite timestep.\n However, the other environments are supposed to terminate within a finite timestep with probability one.\n legal_action_mask (jnp.ndarray): Boolean array of legal actions. If illegal action is taken,\n the game will terminate immediately with the penalty to the palyer.\n \"\"\"\ncurrent_player: jnp.ndarray\nobservation: jnp.ndarray\nrewards: jnp.ndarray\nterminated: jnp.ndarray\ntruncated: jnp.ndarray\nlegal_action_mask: jnp.ndarray\n# NOTE: _rng_key is\n# - used for stochastic env and auto reset\n# - updated only when actually used\n# - supposed NOT to be used by agent\n_rng_key: jax.random.KeyArray\n_step_count: jnp.ndarray\n@property\n@abc.abstractmethod\ndef env_id(self) -> EnvId:\n\"\"\"Environment id (e.g. \"go_19x19\")\"\"\"\n...\ndef _repr_html_(self) -> str:\nreturn self.to_svg()\ndef to_svg(\nself,\n*,\ncolor_theme: Optional[Literal[\"light\", \"dark\"]] = None,\nscale: Optional[float] = None,\n) -> str:\n\"\"\"Return SVG string. Useful for visualization in notebook.\n Args:\n color_theme (Optional[Literal[\"light\", \"dark\"]]): xxx see also global config.\n scale (Optional[float]): change image size. Default(None) is 1.0\n Returns:\n str: SVG string\n \"\"\"\nfrom pgx._src.visualizer import Visualizer\nv = Visualizer(color_theme=color_theme, scale=scale)\nreturn v.get_dwg(states=self).tostring()\ndef save_svg(\nself,\nfilename,\n*,\ncolor_theme: Optional[Literal[\"light\", \"dark\"]] = None,\nscale: Optional[float] = None,\n) -> None:\n\"\"\"Save the entire state (not observation) to a file.\n The filename must end with `.svg`\n Args:\n color_theme (Optional[Literal[\"light\", \"dark\"]]): xxx see also global config.\n scale (Optional[float]): change image size. Default(None) is 1.0\n Returns:\n None\n \"\"\"\nfrom pgx._src.visualizer import save_svg\nsave_svg(self, filename, color_theme=color_theme, scale=scale)\n
"},{"location":"api/#pgx.v1.State.env_id","title":"env_id: EnvId
property
abstractmethod
","text":"Environment id (e.g. \"go_19x19\")
"},{"location":"api/#pgx.v1.State.save_svg","title":"save_svg(filename, *, color_theme=None, scale=None)
","text":"Save the entire state (not observation) to a file. The filename must end with .svg
Parameters:
Name Type Description Defaultcolor_theme
Optional[Literal['light', 'dark']]
xxx see also global config.
None
scale
Optional[float]
change image size. Default(None) is 1.0
None
Returns:
Type DescriptionNone
None
Source code inpgx/v1.py
def save_svg(\nself,\nfilename,\n*,\ncolor_theme: Optional[Literal[\"light\", \"dark\"]] = None,\nscale: Optional[float] = None,\n) -> None:\n\"\"\"Save the entire state (not observation) to a file.\n The filename must end with `.svg`\n Args:\n color_theme (Optional[Literal[\"light\", \"dark\"]]): xxx see also global config.\n scale (Optional[float]): change image size. Default(None) is 1.0\n Returns:\n None\n \"\"\"\nfrom pgx._src.visualizer import save_svg\nsave_svg(self, filename, color_theme=color_theme, scale=scale)\n
"},{"location":"api/#pgx.v1.State.to_svg","title":"to_svg(*, color_theme=None, scale=None)
","text":"Return SVG string. Useful for visualization in notebook.
Parameters:
Name Type Description Defaultcolor_theme
Optional[Literal['light', 'dark']]
xxx see also global config.
None
scale
Optional[float]
change image size. Default(None) is 1.0
None
Returns:
Name Type Descriptionstr
str
SVG string
Source code inpgx/v1.py
def to_svg(\nself,\n*,\ncolor_theme: Optional[Literal[\"light\", \"dark\"]] = None,\nscale: Optional[float] = None,\n) -> str:\n\"\"\"Return SVG string. Useful for visualization in notebook.\n Args:\n color_theme (Optional[Literal[\"light\", \"dark\"]]): xxx see also global config.\n scale (Optional[float]): change image size. Default(None) is 1.0\n Returns:\n str: SVG string\n \"\"\"\nfrom pgx._src.visualizer import Visualizer\nv = Visualizer(color_theme=color_theme, scale=scale)\nreturn v.get_dwg(states=self).tostring()\n
"},{"location":"api/#pgx.Env","title":"pgx.Env
","text":" Bases: abc.ABC
Environment class API.
Example usage
env: Env = pgx.make(\"tic_tac_toe\")\nstate = env.init(jax.random.PRNGKey(0))\naction = jax.random.int32(4)\nstate = env.step(state, action)\n
Source code in pgx/v1.py
class Env(abc.ABC):\n\"\"\"Environment class API.\n !!! example \"Example usage\"\n ```py\n env: Env = pgx.make(\"tic_tac_toe\")\n state = env.init(jax.random.PRNGKey(0))\n action = jax.random.int32(4)\n state = env.step(state, action)\n ```\n \"\"\"\ndef __init__(self):\n...\ndef init(self, key: jax.random.KeyArray) -> State:\n\"\"\"Return the initial state. Note that no internal state of\n environment changes.\n Args:\n key: pseudo-random generator key in JAX\n Returns:\n State: initial state of environment\n \"\"\"\nkey, subkey = jax.random.split(key)\nstate = self._init(subkey)\nstate = state.replace(_rng_key=key) # type: ignore\nobservation = self.observe(state, state.current_player)\nreturn state.replace(observation=observation) # type: ignore\ndef step(self, state: State, action: jnp.ndarray) -> State:\n\"\"\"Step function.\"\"\"\nis_illegal = ~state.legal_action_mask[action]\ncurrent_player = state.current_player\n# If the state is already terminated or truncated, environment does not take usual step,\n# but return the same state with zero-rewards for all players\nstate = jax.lax.cond(\n(state.terminated | state.truncated),\nlambda: state.replace(rewards=jnp.zeros_like(state.rewards)), # type: ignore\nlambda: self._step(state.replace(_step_count=state._step_count + 1), action), # type: ignore\n)\n# Taking illegal action leads to immediate game terminal with negative reward\nstate = jax.lax.cond(\nis_illegal,\nlambda: self._step_with_illegal_action(state, current_player),\nlambda: state,\n)\n# All legal_action_mask elements are **TRUE** at terminal state\n# This is to avoid zero-division error when normalizing action probability\n# Taking any action at terminal state does not give any effect to the state\nstate = jax.lax.cond(\nstate.terminated,\nlambda: state.replace( # type: ignore\nlegal_action_mask=jnp.ones_like(state.legal_action_mask)\n),\nlambda: state,\n)\nobservation = self.observe(state, state.current_player)\nstate = state.replace(observation=observation) # type: ignore\nreturn state\ndef observe(self, state: State, player_id: jnp.ndarray) -> jnp.ndarray:\n\"\"\"Observation function.\"\"\"\nobs = self._observe(state, player_id)\nreturn jax.lax.stop_gradient(obs)\n@abc.abstractmethod\ndef _init(self, key: jax.random.KeyArray) -> State:\n\"\"\"Implement game-specific init function here.\"\"\"\n...\n@abc.abstractmethod\ndef _step(self, state, action) -> State:\n\"\"\"Implement game-specific step function here.\"\"\"\n...\n@abc.abstractmethod\ndef _observe(self, state: State, player_id: jnp.ndarray) -> jnp.ndarray:\n\"\"\"Implement game-specific observe function here.\"\"\"\n...\n@property\n@abc.abstractmethod\ndef id(self) -> EnvId:\n\"\"\"Environment id.\"\"\"\n...\n@property\n@abc.abstractmethod\ndef version(self) -> str:\n\"\"\"Environment version. 
Updated when behavior, parameter, or API is changed.\n Refactoring or speeding up without any expected behavior changes will NOT update the version number.\n \"\"\"\n...\n@property\n@abc.abstractmethod\ndef num_players(self) -> int:\n\"\"\"Number of players (e.g., 2 in Tic-tac-toe)\"\"\"\n...\n@property\ndef num_actions(self) -> int:\n\"\"\"Return the size of action space (e.g., 9 in Tic-tac-toe)\"\"\"\nstate = self.init(jax.random.PRNGKey(0))\nreturn int(state.legal_action_mask.shape[0])\n@property\ndef observation_shape(self) -> Tuple[int, ...]:\n\"\"\"Return the matrix shape of observation\"\"\"\nstate = self.init(jax.random.PRNGKey(0))\nobs = self._observe(state, state.current_player)\nreturn obs.shape\n@property\ndef _illegal_action_penalty(self) -> float:\n\"\"\"Negative reward given when illegal action is selected.\"\"\"\nreturn -1.0\ndef _step_with_illegal_action(\nself, state: State, loser: jnp.ndarray\n) -> State:\npenalty = self._illegal_action_penalty\nreward = (\njnp.ones_like(state.rewards)\n* (-1 * penalty)\n* (self.num_players - 1)\n)\nreward = reward.at[loser].set(penalty)\nreturn state.replace(rewards=reward, terminated=TRUE) # type: ignore\n
"},{"location":"api/#pgx.v1.Env.id","title":"id: EnvId
property
abstractmethod
","text":"Environment id.
"},{"location":"api/#pgx.v1.Env.num_actions","title":"num_actions: int
property
","text":"Return the size of action space (e.g., 9 in Tic-tac-toe)
"},{"location":"api/#pgx.v1.Env.num_players","title":"num_players: int
property
abstractmethod
","text":"Number of players (e.g., 2 in Tic-tac-toe)
"},{"location":"api/#pgx.v1.Env.observation_shape","title":"observation_shape: Tuple[int, ...]
property
","text":"Return the matrix shape of observation
"},{"location":"api/#pgx.v1.Env.version","title":"version: str
property
abstractmethod
","text":"Environment version. Updated when behavior, parameter, or API is changed. Refactoring or speeding up without any expected behavior changes will NOT update the version number.
"},{"location":"api/#pgx.v1.Env.init","title":"init(key)
","text":"Return the initial state. Note that no internal state of environment changes.
Parameters:
Name Type Description Defaultkey
jax.random.KeyArray
pseudo-random generator key in JAX
requiredReturns:
Name Type DescriptionState
State
initial state of environment
Source code inpgx/v1.py
def init(self, key: jax.random.KeyArray) -> State:\n\"\"\"Return the initial state. Note that no internal state of\n environment changes.\n Args:\n key: pseudo-random generator key in JAX\n Returns:\n State: initial state of environment\n \"\"\"\nkey, subkey = jax.random.split(key)\nstate = self._init(subkey)\nstate = state.replace(_rng_key=key) # type: ignore\nobservation = self.observe(state, state.current_player)\nreturn state.replace(observation=observation) # type: ignore\n
"},{"location":"api/#pgx.v1.Env.observe","title":"observe(state, player_id)
","text":"Observation function.
Source code inpgx/v1.py
def observe(self, state: State, player_id: jnp.ndarray) -> jnp.ndarray:\n\"\"\"Observation function.\"\"\"\nobs = self._observe(state, player_id)\nreturn jax.lax.stop_gradient(obs)\n
"},{"location":"api/#pgx.v1.Env.step","title":"step(state, action)
","text":"Step function.
Source code inpgx/v1.py
def step(self, state: State, action: jnp.ndarray) -> State:\n\"\"\"Step function.\"\"\"\nis_illegal = ~state.legal_action_mask[action]\ncurrent_player = state.current_player\n# If the state is already terminated or truncated, environment does not take usual step,\n# but return the same state with zero-rewards for all players\nstate = jax.lax.cond(\n(state.terminated | state.truncated),\nlambda: state.replace(rewards=jnp.zeros_like(state.rewards)), # type: ignore\nlambda: self._step(state.replace(_step_count=state._step_count + 1), action), # type: ignore\n)\n# Taking illegal action leads to immediate game terminal with negative reward\nstate = jax.lax.cond(\nis_illegal,\nlambda: self._step_with_illegal_action(state, current_player),\nlambda: state,\n)\n# All legal_action_mask elements are **TRUE** at terminal state\n# This is to avoid zero-division error when normalizing action probability\n# Taking any action at terminal state does not give any effect to the state\nstate = jax.lax.cond(\nstate.terminated,\nlambda: state.replace( # type: ignore\nlegal_action_mask=jnp.ones_like(state.legal_action_mask)\n),\nlambda: state,\n)\nobservation = self.observe(state, state.current_player)\nstate = state.replace(observation=observation) # type: ignore\nreturn state\n
"},{"location":"api/#pgx.EnvId","title":"pgx.EnvId = Literal['2048', 'animal_shogi', 'backgammon', 'bridge_bidding', 'chess', 'connect_four', 'gardner_chess', 'go_9x9', 'go_19x19', 'hex', 'kuhn_poker', 'leduc_holdem', 'minatar-asterix', 'minatar-breakout', 'minatar-freeway', 'minatar-seaquest', 'minatar-space_invaders', 'othello', 'shogi', 'sparrow_mahjong', 'tic_tac_toe']
module-attribute
","text":""},{"location":"api/#pgx.make","title":"pgx.make(env_id)
","text":"Load the specified environment.
Example usage
env = pgx.make(\"tic_tac_toe\")\n
BridgeBidding
environment
BridgeBidding
environment requires the domain knowledge of bridge game. So we forbid users to load the bridge environment by make(\"bridge_bidding\")
. Use BridgeBidding
class directly by from pgx.bridge_bidding import BridgeBidding
.
pgx/v1.py
def make(env_id: EnvId): # noqa: C901\n\"\"\"Load the specified environment.\n !!! example \"Example usage\"\n ```py\n env = pgx.make(\"tic_tac_toe\")\n ```\n !!! note \"`BridgeBidding` environment\"\n `BridgeBidding` environment requires the domain knowledge of bridge game.\n So we forbid users to load the bridge environment by `make(\"bridge_bidding\")`.\n Use `BridgeBidding` class directly by `from pgx.bridge_bidding import BridgeBidding`.\n \"\"\"\n# NOTE: BridgeBidding environment requires the domain knowledge of bridge\n# So we forbid users to load the bridge environment by `make(\"bridge_bidding\")`.\nif env_id == \"2048\":\nfrom pgx.play2048 import Play2048\nreturn Play2048()\nelif env_id == \"animal_shogi\":\nfrom pgx.animal_shogi import AnimalShogi\nreturn AnimalShogi()\nelif env_id == \"backgammon\":\nfrom pgx.backgammon import Backgammon\nreturn Backgammon()\nelif env_id == \"chess\":\nfrom pgx.chess import Chess\nreturn Chess()\nelif env_id == \"connect_four\":\nfrom pgx.connect_four import ConnectFour\nreturn ConnectFour()\nelif env_id == \"gardner_chess\":\nfrom pgx.gardner_chess import GardnerChess\nreturn GardnerChess()\nelif env_id == \"go_9x9\":\nfrom pgx.go import Go\nreturn Go(size=9, komi=7.5)\nelif env_id == \"go_19x19\":\nfrom pgx.go import Go\nreturn Go(size=19, komi=7.5)\nelif env_id == \"hex\":\nfrom pgx.hex import Hex\nreturn Hex()\nelif env_id == \"kuhn_poker\":\nfrom pgx.kuhn_poker import KuhnPoker\nreturn KuhnPoker()\nelif env_id == \"leduc_holdem\":\nfrom pgx.leduc_holdem import LeducHoldem\nreturn LeducHoldem()\nelif env_id == \"minatar-asterix\":\ntry:\nfrom pgx_minatar.asterix import MinAtarAsterix # type: ignore\nreturn MinAtarAsterix()\nexcept ModuleNotFoundError:\nprint(\n'\"minatar-asterix\" environment is provided as a separate plugin of Pgx.\\nPlease run `$ pip install pgx-minatar` to use this environment in Pgx.',\nfile=sys.stderr,\n)\nsys.exit(1)\nelif env_id == \"minatar-breakout\":\ntry:\nfrom pgx_minatar.breakout import MinAtarBreakout # type: ignore\nreturn MinAtarBreakout()\nexcept ModuleNotFoundError:\nprint(\n'\"minatar-breakout\" environment is provided as a separate plugin of Pgx.\\nPlease run `$ pip install pgx-minatar` to use this environment in Pgx.',\nfile=sys.stderr,\n)\nsys.exit(1)\nelif env_id == \"minatar-freeway\":\ntry:\nfrom pgx_minatar.freeway import MinAtarFreeway # type: ignore\nreturn MinAtarFreeway()\nexcept ModuleNotFoundError:\nprint(\n'\"minatar-freeway\" environment is provided as a separate plugin of Pgx.\\nPlease run `$ pip install pgx-minatar` to use this environment in Pgx.',\nfile=sys.stderr,\n)\nsys.exit(1)\nelif env_id == \"minatar-seaquest\":\ntry:\nfrom pgx_minatar.seaquest import MinAtarSeaquest # type: ignore\nreturn MinAtarSeaquest()\nexcept ModuleNotFoundError:\nprint(\n'\"minatar-seaquest\" environment is provided as a separate plugin of Pgx.\\nPlease run `$ pip install pgx-minatar` to use this environment in Pgx.',\nfile=sys.stderr,\n)\nsys.exit(1)\nelif env_id == \"minatar-space_invaders\":\ntry:\nfrom pgx_minatar.space_invaders import ( # type: ignore\nMinAtarSpaceInvaders,\n)\nreturn MinAtarSpaceInvaders()\nexcept ModuleNotFoundError:\nprint(\n'\"minatar-space_invaders\" environment is provided as a separate plugin of Pgx.\\nPlease run `$ pip install pgx-minatar` to use this environment in Pgx.',\nfile=sys.stderr,\n)\nsys.exit(1)\nelif env_id == \"othello\":\nfrom pgx.othello import Othello\nreturn Othello()\nelif env_id == \"shogi\":\nfrom pgx.shogi import Shogi\nreturn Shogi()\nelif env_id == 
\"sparrow_mahjong\":\nfrom pgx.sparrow_mahjong import SparrowMahjong\nreturn SparrowMahjong()\nelif env_id == \"tic_tac_toe\":\nfrom pgx.tic_tac_toe import TicTacToe\nreturn TicTacToe()\nelse:\nenvs = \"\\n\".join(available_envs())\nraise ValueError(\nf\"Wrong env_id '{env_id}' is passed. Available ids are: \\n{envs}\"\n)\n
"},{"location":"api/#pgx.available_envs","title":"pgx.available_envs()
","text":"List up all environment id available in pgx.make
function.
Example usage
pgx.available_envs()\n('2048', 'animal_shogi', 'backgammon', 'chess', 'connect_four', 'go_9x9', 'go_19x19', 'hex', 'kuhn_poker', 'leduc_holdem', 'minatar-asterix', 'minatar-breakout', 'minatar-freeway', 'minatar-seaquest', 'minatar-space_invaders', 'othello', 'shogi', 'sparrow_mahjong', 'tic_tac_toe')\n
BridgeBidding
environment
BridgeBidding
environment requires domain knowledge of the bridge game. So we forbid users from loading the bridge environment via make(\"bridge_bidding\")
. Use BridgeBidding
class directly by from pgx.bridge_bidding import BridgeBidding
.
pgx/v1.py
def available_envs() -> Tuple[EnvId, ...]:\n\"\"\"List up all environment id available in `pgx.make` function.\n !!! example \"Example usage\"\n ```py\n pgx.available_envs()\n ('2048', 'animal_shogi', 'backgammon', 'chess', 'connect_four', 'go_9x9', 'go_19x19', 'hex', 'kuhn_poker', 'leduc_holdem', 'minatar-asterix', 'minatar-breakout', 'minatar-freeway', 'minatar-seaquest', 'minatar-space_invaders', 'othello', 'shogi', 'sparrow_mahjong', 'tic_tac_toe')\n ```\n !!! note \"`BridgeBidding` environment\"\n `BridgeBidding` environment requires the domain knowledge of bridge game.\n So we forbid users to load the bridge environment by `make(\"bridge_bidding\")`.\n Use `BridgeBidding` class directly by `from pgx.bridge_bidding import BridgeBidding`.\n \"\"\"\ngames = get_args(EnvId)\ngames = tuple(filter(lambda x: x != \"bridge_bidding\", games))\nreturn games\n
"},{"location":"api/#pgx.set_visualization_config","title":"pgx.set_visualization_config(*, color_theme='light', scale=1.0, frame_duration_seconds=0.2)
","text":"Source code in pgx/_src/visualizer.py
def set_visualization_config(\n*,\ncolor_theme: ColorTheme = \"light\",\nscale: float = 1.0,\nframe_duration_seconds: float = 0.2,\n):\nglobal_config.color_theme = color_theme\nglobal_config.scale = scale\nglobal_config.frame_duration_seconds = frame_duration_seconds\n
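A minimal usage sketch (not from the source above; it only exercises the keyword arguments shown in the signature, and assumes the global config is what the visualizer falls back to when per-call arguments are omitted):

```py
import pgx

# Set global visualization defaults; e.g., save_svg_animation falls back to
# frame_duration_seconds from this config when it is not passed explicitly.
pgx.set_visualization_config(color_theme="dark", scale=2.0, frame_duration_seconds=0.5)
```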
"},{"location":"api/#pgx.save_svg","title":"pgx.save_svg(state, filename, *, color_theme=None, scale=None)
","text":"Source code in pgx/_src/visualizer.py
def save_svg(\nstate: State,\nfilename: Union[str, Path],\n*,\ncolor_theme: Optional[Literal[\"light\", \"dark\"]] = None,\nscale: Optional[float] = None,\n) -> None:\nassert str(filename).endswith(\".svg\")\nif state.env_id.startswith(\"minatar\"):\nstate.save_svg(filename=filename)\nelse:\nv = Visualizer(color_theme=color_theme, scale=scale)\nv.get_dwg(states=state).saveas(filename)\n
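A usage sketch (illustrative; any Pgx state can be passed, and the filename must end with `.svg` per the assertion above):

```py
import jax
import pgx

env = pgx.make("go_9x9")
state = env.init(jax.random.PRNGKey(0))
pgx.save_svg(state, "initial_state.svg", color_theme="dark", scale=2.0)
```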
"},{"location":"api/#pgx.save_svg_animation","title":"pgx.save_svg_animation(states, filename, *, color_theme=None, scale=None, frame_duration_seconds=None)
","text":"Source code in pgx/_src/visualizer.py
def save_svg_animation(\nstates: Sequence[State],\nfilename: Union[str, Path],\n*,\ncolor_theme: Optional[Literal[\"light\", \"dark\"]] = None,\nscale: Optional[float] = None,\nframe_duration_seconds: Optional[float] = None,\n) -> None:\nassert not states[0].env_id.startswith(\n\"minatar\"\n), \"MinAtar does not support svg animation.\"\nassert str(filename).endswith(\".svg\")\nv = Visualizer(color_theme=color_theme, scale=scale)\nif frame_duration_seconds is None:\nframe_duration_seconds = global_config.frame_duration_seconds\nframe_groups = []\ndwg = None\nfor i, state in enumerate(states):\ndwg = v.get_dwg(states=state)\nassert (\nlen(\n[\ne\nfor e in dwg.elements\nif type(e) == svgwrite.container.Group\n]\n)\n== 1\n), \"Drawing must contain only one group\"\ngroup: svgwrite.container.Group = dwg.elements[-1]\ngroup[\"id\"] = f\"_fr{i:x}\" # hex frame number\ngroup[\"class\"] = \"frame\"\nframe_groups.append(group)\nassert dwg is not None\ndel dwg.elements[-1]\ntotal_seconds = frame_duration_seconds * len(frame_groups)\nstyle = f\".frame{{visibility:hidden; animation:{total_seconds}s linear _k infinite;}}\"\nstyle += f\"@keyframes _k{{0%,{100/len(frame_groups)}%{{visibility:visible}}{100/len(frame_groups) * 1.000001}%,100%{{visibility:hidden}}}}\"\nfor i, group in enumerate(frame_groups):\ndwg.add(group)\nstyle += (\nf\"#{group['id']}{{animation-delay:{i * frame_duration_seconds}s}}\"\n)\ndwg.defs.add(svgwrite.container.Style(content=style))\ndwg.saveas(filename)\n
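A usage sketch that collects the states of one random game and writes them as an animated SVG (illustrative only; MinAtar states are not supported, per the assertion above):

```py
import jax
import jax.numpy as jnp
import pgx

env = pgx.make("tic_tac_toe")
key = jax.random.PRNGKey(0)
state = env.init(key)
states = [state]
while not (state.terminated | state.truncated).all():
    key, subkey = jax.random.split(key)
    # Choose uniformly among the legal actions.
    logits = jnp.where(state.legal_action_mask, 0.0, -jnp.inf)
    action = jax.random.categorical(subkey, logits=logits)
    state = env.step(state, action)
    states.append(state)
pgx.save_svg_animation(states, "tic_tac_toe.svg", frame_duration_seconds=0.5)
```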
"},{"location":"api/#pgx.BaselineModelId","title":"pgx.BaselineModelId = Literal['animal_shogi_v0', 'gardner_chess_v0', 'go_9x9_v0', 'hex_v0', 'othello_v0']
module-attribute
","text":""},{"location":"api/#pgx.make_baseline_model","title":"pgx.make_baseline_model(model_id, download_dir='baselines')
","text":"Source code in pgx/_src/baseline.py
def make_baseline_model(\nmodel_id: BaselineModelId, download_dir: str = \"baselines\"\n):\nimport haiku as hk\ncreate_model_fn = _make_create_model_fn(model_id)\nmodel_args, model_params, model_state = _load_baseline_model(\nmodel_id, download_dir\n)\ndef forward_fn(x, is_eval=False):\nnet = create_model_fn(**model_args)\npolicy_out, value_out = net(\nx, is_training=not is_eval, test_local_stats=False\n)\nreturn policy_out, value_out\nforward = hk.without_apply_rng(hk.transform_with_state(forward_fn))\ndef apply(obs):\n(logits, value), _ = forward.apply(\nmodel_params, model_state, obs, is_eval=True\n)\nreturn logits, value\nreturn apply\n
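A usage sketch (assumes the optional Haiku dependency is installed via `pip install dm-haiku`; weights are downloaded to the default `download_dir="baselines"`):

```py
import jax
import pgx

env = pgx.make("go_9x9")
init_fn = jax.jit(jax.vmap(env.init))
state = init_fn(jax.random.split(jax.random.PRNGKey(0), 4))

apply = pgx.make_baseline_model("go_9x9_v0")
logits, value = apply(state.observation)  # batched policy logits and value estimates
action = logits.argmax(axis=-1)           # greedy baseline action per game
```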
"},{"location":"api/#pgx.v1_api_test","title":"pgx.v1_api_test(env, num=100)
","text":"Source code in pgx/_src/api_test.py
def v1_api_test(env: Env, num: int = 100):\napi_test_single(env, num)\napi_test_batch(env, num)\n
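A usage sketch (illustrative); the helper runs both the single-state and batched API checks against an environment:

```py
import pgx

env = pgx.make("tic_tac_toe")
pgx.v1_api_test(env, num=50)  # num is forwarded to the single-state and batched checks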
"},{"location":"api_usage/","title":"Pgx API Usage","text":""},{"location":"api_usage/#example1-random-play","title":"Example.1: Random play","text":"import jax\nimport jax.numpy as jnp\nimport pgx\nseed = 42\nbatch_size = 10\nkey = jax.random.PRNGKey(seed)\ndef act_randomly(rng_key, obs, mask):\n\"\"\"Ignore observation and choose randomly from legal actions\"\"\"\ndel obs\nprobs = mask / mask.sum()\nlogits = jnp.maximum(jnp.log(probs), jnp.finfo(probs.dtype).min)\nreturn jax.random.categorical(rng_key, logits=logits, axis=-1)\n# Load the environment\nenv = pgx.make(\"go_9x9\")\ninit_fn = jax.jit(jax.vmap(env.init))\nstep_fn = jax.jit(jax.vmap(env.step))\n# Initialize the states\nkey, subkey = jax.random.split(key)\nkeys = jax.random.split(subkey, batch_size)\nstate = init_fn(keys)\n# Run random simulation\nwhile not (state.terminated | state.truncated).all():\nkey, subkey = jax.random.split(key)\naction = act_randomly(subkey, state.observation, state.legal_action_mask)\nstate = step_fn(state, action) # state.reward (2,)\n
"},{"location":"api_usage/#example2-random-agent-vs-baseline-model","title":"Example.2: Random agent vs Baseline model","text":"This illustrative example helps to understand
state.current_player
is definedEnv.step
behaves against already terminated statesimport jax\nimport jax.numpy as jnp\nimport pgx\nfrom pgx.experimental.utils import act_randomly\nseed = 42\nbatch_size = 10\nkey = jax.random.PRNGKey(seed)\n# Prepare agent A and B\n# Agent A: random player\n# Agent B: baseline player provided by Pgx\nA = 0\nB = 1\n# Load the environment\nenv = pgx.make(\"go_9x9\")\ninit_fn = jax.jit(jax.vmap(env.init))\nstep_fn = jax.jit(jax.vmap(env.step))\n# Prepare baseline model\n# Note that it additionaly requires Haiku library ($ pip install dm-haiku)\nmodel_id = \"go_9x9_v0\"\nmodel = pgx.make_baseline_model(model_id)\n# Initialize the states\nkey, subkey = jax.random.split(key)\nkeys = jax.random.split(subkey, batch_size)\nstate = init_fn(keys)\nprint(f\"Game index: {jnp.arange(batch_size)}\") # [0 1 2 3 4 5 6 7 8 9]\nprint(f\"Black player: {state.current_player}\") # [1 1 0 1 0 0 1 1 1 1]\n# In other words\nprint(f\"A is black: {state.current_player == A}\") # [False False True False True True False False False False]\nprint(f\"B is black: {state.current_player == B}\") # [ True True False True False False True True True True]\n# Run simulation\nR = state.rewards\nwhile not (state.terminated | state.truncated).all():\n# Action of random player A\nkey, subkey = jax.random.split(key)\naction_A = jax.jit(act_randomly)(subkey, state)\n# Greedy action of baseline model B\nlogits, value = model(state.observation)\naction_B = logits.argmax(axis=-1)\naction = jnp.where(state.current_player == A, action_A, action_B)\nstate = step_fn(state, action)\nR += state.rewards\nprint(f\"Return of agent A = {R[:, A]}\") # [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]\nprint(f\"Return of agent B = {R[:, B]}\") # [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n
Note that we can avoid explicitly dealing with the first batch dimension like [:, A]
by using vmap
later.
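As a hypothetical illustration of this remark (reusing R and A from the example above), the same per-agent return can be obtained by vmapping a scalar lookup over the batch dimension:

```py
import jax

returns_A = jax.vmap(lambda r: r[A])(R)  # equivalent to R[:, A]
```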
"},{"location":"backgammon/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"backgammon\")\n
or you can directly load Backgammon
class
from pgx.backgammon import Backgammon\nenv = Backgammon()\n
"},{"location":"backgammon/#description","title":"Description","text":"Backgammon ...
Wikipedia
"},{"location":"backgammon/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 162 (= 6 * 26 + 6)
Observation shape (34,)
Observation type int
Rewards {-3, -2, -1, 0, 1, 2, 3}
"},{"location":"backgammon/#observation","title":"Observation","text":"The first 28
observation dimensions follow [Antonoglou+22]
:
An action in our implementation consists of 4 micro-actions, the same as the maximum number of dice a player can play at each turn. Each micro-action encodes the source position of a chip along with the value of the die used. We consider 26 possible source positions, with the 0-th position corresponding to a no-op, the 1st to retrieving a chip from the hit pile, and the remaining to selecting a chip in one of the 24 possible points. Each micro-action is encoded as a single integer with micro-action = src \u00b7 6 + die
.
[:24]
represents [24:28]
represents [28:34]
is one-hot vector of playable dice"},{"location":"backgammon/#action","title":"Action","text":"...
"},{"location":"backgammon/#rewards","title":"Rewards","text":"...
"},{"location":"backgammon/#termination","title":"Termination","text":"...
"},{"location":"backgammon/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)[Antonoglou+22]
\"Planning in Stochastic Environments with a Learned Modell\", ICLR
"},{"location":"bridge_bidding/#description","title":"Description","text":"
TBA
"},{"location":"chess/","title":"Chess","text":"darklight
"},{"location":"chess/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"chess\")\n
or you can directly load Chess
class
from pgx.chess import Chess\nenv = Chess()\n
"},{"location":"chess/#description","title":"Description","text":"TBA
"},{"location":"chess/#rules","title":"Rules","text":"TBA
"},{"location":"chess/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 4672
Observation shape (8, 8, 119)
Observation type float
Rewards {-1, 0, 1}
"},{"location":"chess/#observation","title":"Observation","text":"We follow the observation design of AlphaZero [Silver+18]
.
TBA
"},{"location":"chess/#rewards","title":"Rewards","text":"Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Draw 0
"},{"location":"chess/#termination","title":"Termination","text":"Termination occurs when one of the following conditions are satisfied:
50
halfmoves are elapsed without any captures or pawn moves512
steps are elapsed (from AlphaZero [Silver+18]
)v1
: Bug fix when castling by @HongruiTang in #983 (v1.1.0) v0
: Initial release (v1.0.0)[Silver+18]
\"A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play\" Science
"},{"location":"connect_four/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"connect_four\")\n
or you can directly load ConnectFour
class
from pgx.connect_four import ConnectFour\nenv = ConnectFour()\n
"},{"location":"connect_four/#description","title":"Description","text":"Connect Four is a two-player connection rack game, in which the players choose a color and then take turns dropping colored tokens into a seven-column, six-row vertically suspended grid. The pieces fall straight down, occupying the lowest available space within the column. The objective of the game is to be the first to form a horizontal, vertical, or diagonal line of four of one's own tokens.
Wikipedia
"},{"location":"connect_four/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 7
Observation shape (6, 7, 2)
Observation type bool
Rewards {-1, 0, 1}
"},{"location":"connect_four/#observation","title":"Observation","text":"Index Description [:, :, 0]
represents (6, 7)
squares filled by the current player [:, :, 1]
represents (6, 7)
squares filled by the opponent player of current player"},{"location":"connect_four/#action","title":"Action","text":"Each action represents the column index the player drops the token to.
"},{"location":"connect_four/#rewards","title":"Rewards","text":"Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Draw 0
"},{"location":"connect_four/#termination","title":"Termination","text":"Termination happens when
42 (= 6 x 7)
squares are filled.v0
: Initial release (v1.0.0)
"},{"location":"gardner_chess/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"gardner_chess\")\n
or you can directly load GardnerChess
class
from pgx.gardner_chess import GardnerChess\nenv = GardnerChess()\n
"},{"location":"gardner_chess/#description","title":"Description","text":"TBA
"},{"location":"gardner_chess/#rules","title":"Rules","text":"TBA
"},{"location":"gardner_chess/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 1225
Observation shape (5, 5, 115)
Observation type float
Rewards {-1, 0, 1}
"},{"location":"gardner_chess/#observation","title":"Observation","text":"We follow the observation design of AlphaZero [Silver+18]
.
TBA
"},{"location":"gardner_chess/#rewards","title":"Rewards","text":"Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Draw 0
"},{"location":"gardner_chess/#termination","title":"Termination","text":"Termination occurs when one of the following conditions are satisfied:
50
halfmoves are elapsed without any captures or pawn moves256
steps are elapsed (512
in full-size chess experiments in AlphaZero [Silver+18]
)v0
: Initial release (v1.0.0)[Silver+18]
\"A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play\" Science
"},{"location":"go/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"go_19x19\") # or \"go_9x9\"\n
or you can directly load Go
class
from pgx.go import Go\nenv = Go(size=19, komi=6.5)\n
"},{"location":"go/#description","title":"Description","text":"Go is an abstract strategy board game for two players in which the aim is to surround more territory than the opponent. The game was invented in China more than 2,500 years ago and is believed to be the oldest board game continuously played to the present day.
Wikipedia
"},{"location":"go/#rules","title":"Rules","text":"The rule implemented in Pgx follows Tromp-Taylor Rules.
Komi
By default, we use 6.5
. Users can set different komi
at Go
class constructor.
Ko
On PSK implementations.
Tromp-Taylor rule employ PSK. However, implementing strict PSK is inefficient because
As PSK rarely happens, as far as our best knowledge, it is usual to compromise in PSK implementations. For example,
Note that the strict rule is \"PSK for legal actions, and PSK action leads to immediate lose.\" So, we also compromise at this point, our approach is
Anyway, we believe it's effect is very small as PSK rarely happens, especially in 19x19 board.
"},{"location":"go/#specs","title":"Specs","text":"Let N
be the board size (e.g., 19
).
v0
Number of players 2
Number of actions N x N + 1
Observation shape (N, N, 17)
Observation type bool
Rewards {-1, 1}
"},{"location":"go/#observation","title":"Observation","text":"We follow the observation design of AlphaGo Zero [Silver+17]
.
obs[:, :, 0]
stones of player_id
(@ current board) obs[:, :, 1]
stones of player_id
's opponent (@ current board) obs[:, :, 2]
stones of player_id
(@ 1-step before) obs[:, :, 3]
stones of player_id
's opponent (@ 1-step before) ... ... obs[:, :, -1]
color of player_id
Final observation dimension
For the final dimension, there are two possible options:
player_id
This ambiguity happens because observe
function is available even if player_id
is different from state.current_player
. In AlphaGo Zero paper [Silver+17]
, the final dimension C is explained as:
The final feature plane, C, represents the colour to play, and has a constant value of either 1 if black is to play or 0 if white is to play.
however, it also describes as
the colour feature C is necessary because the komi is not observable.
So, we use player_id's color to let the agent know komi information. As long as it's called when player_id == state.current_player
, this doesn't matter.
Each action ({0, ..., N * N - 1}
) represents the point to be colored. The final action represents pass action.
Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
"},{"location":"go/#termination","title":"Termination","text":"Termination happens when
N * N * 2
steps are elapsed [Silver+17]
.v0
: Initial release (v1.0.0)[Silver+17]
\"Mastering the game of go without human knowledge\" Nature
"},{"location":"hex/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"hex\")\n
or you can directly load Hex
class
from pgx.hex import Hex\nenv = Hex()\n
"},{"location":"hex/#description","title":"Description","text":"Hex is a two player abstract strategy board game in which players attempt to connect opposite sides of a rhombus-shaped board made of hexagonal cells. Hex was invented by mathematician and poet Piet Hein in 1942 and later rediscovered and popularized by John Nash.
Wikipedia
"},{"location":"hex/#rules","title":"Rules","text":"As the first player to move has a distinct advantage, the swap rule is used to compensate for this. The detailed swap rule used in Pgx follows swap pieces:
\"Swap pieces\": The players perform the swap by switching pieces. This means the initial red piece is replaced by a blue piece in the mirror image position, where the mirroring takes place with respect to the board's long diagonal. For example, a red piece at a3 becomes a blue piece at c1. The players do not switch colours: Red stays Red and Blue stays Blue. After the swap, it is Red's turn.
Hex Wiki - Swap rule
"},{"location":"hex/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 122 (= 11 x 11) + 1
Observation shape (11, 11, 3)
Observation type bool
Rewards {-1, 1}
"},{"location":"hex/#observation","title":"Observation","text":"Index Description [:, :, 0]
represents (11, 11)
cells filled by player_ix
[:, :, 1]
represents (11, 11)
cells filled by the opponent player of player_id
[:, :, 2]
represents whether player_id
is black or white"},{"location":"hex/#action","title":"Action","text":"Each action ({0, ... 120}
) represents the cell index to be filled. The final action 121
is the swap action available only at the second turn.
Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Note that there is no draw in Hex.
"},{"location":"hex/#termination","title":"Termination","text":"Termination happens when either one player connect opposite sides of the board.
"},{"location":"hex/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)
"},{"location":"kuhn_poker/#description","title":"Description","text":"
Kuhn poker is a simplified poker with three cards: J, Q, and K.
"},{"location":"kuhn_poker/#rules","title":"Rules","text":"Each player is dealt one card and the remaining card is unused. There are four actions: check, call, bet, and fold and five possible scenarios.
bet (1st) - call (2nd)
: Showdown and the winner takes +2
bet (1st) - fold (2nd)
: 1st player takes +1
check (1st) - check (2nd)
: Showdown and the winner takes +1
check (1st) - bet (2nd) - call (1st)
: Showdown and the winner takes +2
check (1st) - bet (2nd) - fold (1st)
: 2nd takes +1
v0
Number of players 2
Number of actions 4
Observation shape (7,)
Observation type bool
Rewards {-2, -1, +1, +2}
"},{"location":"kuhn_poker/#observation","title":"Observation","text":"Index Description [0]
One if J in my hand [1]
One if Q in my hand [2]
One if K in my hand [3]
One if 0 chip is bet by me [4]
One if 1 chip is bet by me [5]
One if 0 chip of the opponent [6]
One if 1 chip of the opponent"},{"location":"kuhn_poker/#action","title":"Action","text":"There are four distinct actions.
Action Index Call 0 Bet 1 Fold 2 Check 3"},{"location":"kuhn_poker/#rewards","title":"Rewards","text":"The winner takes +2
or +1
depending on the game payoff. As Kuhn poker is zero-sum game, the loser takes -2
or -1
respectively.
Follows the rules above.
"},{"location":"kuhn_poker/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)
"},{"location":"leduc_holdem/#description","title":"Description","text":"
Leduc hold\u2019em is a simplified poker proposed in [Souhty+05].
"},{"location":"leduc_holdem/#rules","title":"Rules","text":"We quote the description in [Souhty+05]:
Leduc Hold \u2019Em. We have also constructed a smaller version of hold \u2019em, which seeks to retain the strategic elements of the large game while keeping the size of the game tractable. In Leduc hold \u2019em, the deck consists of two suits with three cards in each suit. There are two rounds. In the first round a single private card is dealt to each player. In the second round a single board card is revealed. There is a two-bet maximum, with raise amounts of 2 and 4 in the first and second round, respectively. Both players start the first round with 1 already in the pot.
Figure 1: An example decision tree for a single betting round in poker with a two-bet maximum. Leaf nodes with open boxes continue to the next round, while closed boxes end the hand.
"},{"location":"leduc_holdem/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 4
Observation shape (7,)
Observation type bool
Rewards {-13, -12, ... 0, ..., 12, 13}
"},{"location":"leduc_holdem/#observation","title":"Observation","text":"Index Description [0]
True if J in hand [1]
True if Q in hand [2]
True if K in hand [3]
True if J is the public card [4]
True if J is the public card [5]
True if J is the public card [6:19]
represent my chip count (0, ..., 13) [20:33]
represent opponent's chip count (0, ..., 13)"},{"location":"leduc_holdem/#action","title":"Action","text":"There are four distinct actions.
Index Action 0 Call 1 Raise 2 Fold"},{"location":"leduc_holdem/#rewards","title":"Rewards","text":"The reward is the payoff of the game.
"},{"location":"leduc_holdem/#termination","title":"Termination","text":"Follows the rules above.
"},{"location":"leduc_holdem/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionaly to use the MinAtar suite in Pgx:
pip install pgx-minatar\n
Then, you can use the environment as follows:
import pgx\nenv = pgx.make(\"minatar-asterix\")\n
"},{"location":"minatar_asterix/#description","title":"Description","text":"MinAtar is originally proposed by [Young&Tian+19]
. The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Asterix environment is described as follows:
The player can move freely along the 4 cardinal directions. Enemies and treasure spawn from the sides. A reward of +1 is given for picking up treasure. Termination occurs if the player makes contact with an enemy. Enemy and treasure direction are indicated by a trail channel. Difficulty is periodically increased by increasing the speed and spawn rate of enemies and treasure.
github.com/kenjyoung/MinAtar - asterix.py
"},{"location":"minatar_asterix/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 1
Number of actions 5
Observation shape (10, 10, 4)
Observation type bool
Rewards {0, 1}
"},{"location":"minatar_asterix/#observation","title":"Observation","text":"Index Channel [:, :, 0]
Player [:, :, 1]
Enemy [:, :, 2]
Trail [:, :, 3]
Gold"},{"location":"minatar_asterix/#action","title":"Action","text":"TBA
"},{"location":"minatar_asterix/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)[Young&Tian+19]
\"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments\" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
"},{"location":"minatar_breakout/","title":"MinAtar Breakout","text":""},{"location":"minatar_breakout/#usage","title":"Usage","text":"Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionaly to use the MinAtar suite in Pgx:
pip install pgx-minatar\n
Then, you can use the environment as follows:
import pgx\nenv = pgx.make(\"minatar-breakout\")\n
"},{"location":"minatar_breakout/#description","title":"Description","text":"MinAtar is originally proposed by [Young&Tian+19]
. The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Breakout environment is described as follows:
The player controls a paddle on the bottom of the screen and must bounce a ball tobreak 3 rows of bricks along the top of the screen. A reward of +1 is given for each brick broken by the ball. When all bricks are cleared another 3 rows are added. The ball travels only along diagonals, when it hits the paddle it is bounced either to the left or right depending on the side of the paddle hit, when it hits a wall or brick it is reflected. Termination occurs when the ball hits the bottom of the screen. The balls direction is indicated by a trail channel.
github.com/kenjyoung/MinAtar - breakout.py
"},{"location":"minatar_breakout/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 1
Number of actions 3
Observation shape (10, 10, 4)
Observation type bool
Rewards {0, 1}
"},{"location":"minatar_breakout/#observation","title":"Observation","text":"Index Channel [:, :, 0]
Paddle [:, :, 1]
Ball [:, :, 2]
Trail [:, :, 3]
Brick"},{"location":"minatar_breakout/#action","title":"Action","text":"TBA
"},{"location":"minatar_breakout/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)[Young&Tian+19]
\"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments\" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
"},{"location":"minatar_freeway/","title":"MinAtar Freeway","text":""},{"location":"minatar_freeway/#usage","title":"Usage","text":"Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionaly to use the MinAtar suite in Pgx:
pip install pgx-minatar\n
Then, you can use the environment as follows:
import pgx\nenv = pgx.make(\"minatar-freeway\")\n
"},{"location":"minatar_freeway/#description","title":"Description","text":"MinAtar is originally proposed by [Young&Tian+19]
. The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Freeway environment is described as follows:
The player begins at the bottom of the screen and motion is restricted to traveling up and down. Player speed is also restricted such that the player can only move every 3 frames. A reward of +1 is given when the player reaches the top of the screen, at which point the player is returned to the bottom. Cars travel horizontally on the screen and teleport to the other side when the edge is reached. When hit by a car, the player is returned to the bottom of the screen. Car direction and speed is indicated by 5 trail channels, the location of the trail gives direction while the specific channel indicates how frequently the car moves (from once every frame to once every 5 frames). Each time the player successfully reaches the top of the screen, the car speeds are randomized. Termination occurs after 2500 frames have elapsed.
github.com/kenjyoung/MinAtar - freeway.py
"},{"location":"minatar_freeway/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 1
Number of actions 3
Observation shape (10, 10, 7)
Observation type bool
Rewards {0, 1}
"},{"location":"minatar_freeway/#observation","title":"Observation","text":"Index Channel [:, :, 0]
Chicken [:, :, 1]
Car [:, :, 2]
Speed 1 [:, :, 3]
Speed 2 [:, :, 4]
Speed 3 [:, :, 5]
Speed 4"},{"location":"minatar_freeway/#action","title":"Action","text":"TBA
"},{"location":"minatar_freeway/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)[Young&Tian+19]
\"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments\" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
"},{"location":"minatar_seaquest/","title":"MinAtar Seaquest","text":""},{"location":"minatar_seaquest/#usage","title":"Usage","text":"Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionaly to use the MinAtar suite in Pgx:
pip install pgx-minatar\n
Then, you can use the environment as follows:
import pgx\nenv = pgx.make(\"minatar-seaquest\")\n
"},{"location":"minatar_seaquest/#description","title":"Description","text":"MinAtar is originally proposed by [Young&Tian+19]
. The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Seaquest environment is described as follows:
The player controls a submarine consisting of two cells, front and back, to allow direction to be determined. The player can also fire bullets from the front of the submarine. Enemies consist of submarines and fish, distinguished by the fact that submarines shoot bullets and fish do not. A reward of +1 is given each time an enemy is struck by one of the player's bullets, at which point the enemy is also removed. There are also divers which the player can move onto to pick up, doing so increments a bar indicated by another channel along the bottom of the screen. The player also has a limited supply of oxygen indicated by another bar in another channel. Oxygen degrades over time, and is replenished whenever the player moves to the top of the screen as long as the player has at least one rescued diver on board. The player can carry a maximum of 6 divers. When surfacing with less than 6, one diver is removed. When surfacing with 6, all divers are removed and a reward is given for each active cell in the oxygen bar. Each time the player surfaces the difficulty is increased by increasing the spawn rate and movement speed of enemies. Termination occurs when the player is hit by an enemy fish, sub or bullet; or when oxygen reached 0; or when the player attempts to surface with no rescued divers. Enemy and diver directions are indicated by a trail channel active in their previous location to reduce partial observability.
github.com/kenjyoung/MinAtar - seaquest.py
"},{"location":"minatar_seaquest/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 1
Number of actions 6
Observation shape (10, 10, 10)
Observation type bool
Rewards {0, 1, ..., 10}
"},{"location":"minatar_seaquest/#observation","title":"Observation","text":"Index Channel [:, :, 0]
Player submarine (front) [:, :, 1]
Player submarine (back) [:, :, 2]
Friendly bullet [:, :, 3]
Trail [:, :, 4]
Enemy bullet [:, :, 5]
Enemy fish [:, :, 6]
Enemy submarine [:, :, 7]
Oxygen gauge [:, :, 8]
Diver gauge [:, :, 9]
Diver"},{"location":"minatar_seaquest/#action","title":"Action","text":"TBA
"},{"location":"minatar_seaquest/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)[Young&Tian+19]
\"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments\" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
"},{"location":"minatar_space_invaders/","title":"MinAtar Space Invaders","text":""},{"location":"minatar_space_invaders/#usage","title":"Usage","text":"Note that the MinAtar suite is provided as a separate extension for Pgx (pgx-minatar
). Therefore, please run the following command additionaly to use the MinAtar suite in Pgx:
pip install pgx-minatar\n
Then, you can use the environment as follows:
import pgx\nenv = pgx.make(\"minatar-space_invaders\")\n
"},{"location":"minatar_space_invaders/#description","title":"Description","text":"MinAtar is originally proposed by [Young&Tian+19]
. The Pgx implementation is intended to be the exact copy of the original MinAtar implementation in JAX. The Space Invaders environment is described as follows:
The player controls a cannon at the bottom of the screen and can shoot bullets upward at a cluster of aliens above. The aliens move across the screen until one of them hits the edge, at which point they all move down and switch directions. The current alien direction is indicated by 2 channels (one for left and one for right) one of which is active at the location of each alien. A reward of +1 is given each time an alien is shot, and that alien is also removed. The aliens will also shoot bullets back at the player. When few aliens are left, alien speed will begin to increase. When only one alien is left, it will move at one cell per frame. When a wave of aliens is fully cleared a new one will spawn which moves at a slightly faster speed than the last. Termination occurs when an alien or bullet hits the player.
github.com/kenjyoung/MinAtar - space_invaders.py
"},{"location":"minatar_space_invaders/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 1
Number of actions 4
Observation shape (10, 10, 6)
Observation type bool
Rewards {0, 1}
"},{"location":"minatar_space_invaders/#observation","title":"Observation","text":"Index Channel [:, :, 0]
Cannon [:, :, 1]
Alien [:, :, 2]
Alien left [:, :, 3]
Alien right [:, :, 4]
Friendly bullet [:, :, 5]
Enemy bullet"},{"location":"minatar_space_invaders/#action","title":"Action","text":"TBA
"},{"location":"minatar_space_invaders/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)[Young&Tian+19]
\"Minatar: An atari-inspired testbed for thorough and reproducible reinforcement learning experiments\" arXiv:1903.03176Pgx is provided under the Apache 2.0 License, but the original MinAtar suite follows the GPL 3.0 License. Therefore, please note that the separated MinAtar extension for Pgx also adheres to the GPL 3.0 License.
"},{"location":"othello/","title":"Othello","text":"darklight
"},{"location":"othello/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"othello\")\n
or you can directly load Othello
class
from pgx.othello import Othello\nenv = Othello()\n
"},{"location":"othello/#description","title":"Description","text":"Othello, or differing in not having a defined starting position, Reversi, is a two-player zero-sum and perfect information abstract strategy board game, usually played on a board with 8 rows and 8 columns and a set of light and a dark turnable pieces for each side. The player's goal is to have a majority of their colored pieces showing at the end of the game, turning over as many of their opponent's pieces as possible. The dark player makes the first move from the starting position, alternating with the light player. Each player has to place a piece on the board such that there exists at least one straight (horizontal, vertical, or diagonal) occupied line of opponent pieces between the new piece and another own piece. After placing the piece, the side turns over (flips, captures) all opponent pieces lying on any straight lines between the new piece and any anchoring own pieces.
Chess Programming Wiki
"},{"location":"othello/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 65 (= 8 x 8 + 1)
Observation shape (8, 8, 2)
Observation type bool
Rewards {-1, 0, 1}
"},{"location":"othello/#observation","title":"Observation","text":"Index Description [:, :, 0]
represents (8, 8)
squares colored by the current player [:, :, 1]
represents (8, 8)
squares colored by the opponent player of current player"},{"location":"othello/#action","title":"Action","text":"Each action ({0, ..., 63}
) represents the square index to be filled. The last 64
-th action represents pass action.
Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Draw 0
"},{"location":"othello/#termination","title":"Termination","text":"Termination happens when all 64 (= 8 x 8)
playable squares are filled.
v0
: Initial release (v1.0.0)
"},{"location":"play2048/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"2048\")\n
or you can directly load Play2048
class
from pgx.play2048 import Play2048\nenv = Play2048()\n
"},{"location":"play2048/#description","title":"Description","text":"2048 ...
Wikipedia
"},{"location":"play2048/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 1
Number of actions 4
Observation shape (4, 4, 31)
Observation type bool
Rewards {0, 2, 4, ...}
"},{"location":"play2048/#observation","title":"Observation","text":"Our obseervation design basically follows [Antonoglou+22]
:
In our 2048 experiments we used a binary representation of the observation as an input to our model. Specifically, the 4 \u00d7 4 board was flattened into a single vector of size 16, and a binary representation of 31 bits for each number was obtained, for a total size of 496 numbers.
However, instaead of 496
-d flat vector, we employ (4, 4, 31)
vector.
[i, j, b]
represents that square (i, j)
has a tile of 2 ^ b
if b > 0
"},{"location":"play2048/#action","title":"Action","text":"Each action corresnponds to 0 (left)
, 1 (up)
, 2 (right)
, 3 (down)
.
Sum of merged tiles.
"},{"location":"play2048/#termination","title":"Termination","text":"If all squares are filled with tiles and no legal actions are available, the game terminates.
"},{"location":"play2048/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)[Antonoglou+22]
\"Planning in Stochastic Environments with a Learned Modell\", ICLR
"},{"location":"shogi/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"shogi\")\n
or you can directly load Shogi
class
from pgx.shogi import Shogi\nenv = Shogi()\n
"},{"location":"shogi/#description","title":"Description","text":"TBA
"},{"location":"shogi/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 2187
Observation shape (9, 9, 119)
Observation type bool
Rewards {-1, 0, 1}
"},{"location":"shogi/#observation","title":"Observation","text":"We follow the observation design of dlshogi, an open-source shogi AI. Ther original dlshogi implementations are here. Pgx implementation has [9, 9, 119]
shape and [:, :, x]
denotes:
x
Description 0:14
Where my piece x
exists 14:28
Where my pieces x
are attacking 28:31
Where the number of my attacking pieces are >= 1,2,3
respectively 31:45
Where opponent's piece x
exists 45:59
Where opponent's pieces x
are attacking 59:62
Where the number of opponent's attacking pieces are >= 1,2,3
respectively The following planes are all ones ore zeros
x
Description 62:70
My hand has >= 1, ..., 8
Pawn 70:74
My hand has >= 1, 2, 3, 4
Lance 74:78
My hand has >= 1, 2, 3, 4
Knight 78:82
My hand has >= 1, 2, 3, 4
Silver 82:86
My hand has >= 1, 2, 3, 4
Gold 86:88
My hand has >= 1, 2
Bishop 88:90
My hand has >= 1, 2
Rook 90:98
Oppnent's hand has >= 1, ..., 8
Pawn 98:102
Oppnent's hand has >= 1, 2, 3, 4
Lance 102:106
Oppnent's hand has >= 1, 2, 3, 4
Knight 106:110
Oppnent's hand has >= 1, 2, 3, 4
Silver 110:114
Oppnent's hand has >= 1, 2, 3, 4
Gold 114:116
Oppnent's hand has >= 1, 2
Bishop 116:118
Oppnent's hand has >= 1, 2
Rook 118
Ones if checked Note that piece ids are
Piece Id \u6b69\u3000PAWN
0
\u9999\u3000 LANCE
1
\u6842\u3000 KNIGHT
2
\u9280\u3000 SILVER
3
\u89d2\u3000 BISHOP
4
\u98db\u3000 ROOK
5
\u91d1\u3000 GOLD
6
\u7389\u3000 KING
7
\u3068\u3000 PRO_PAWN
8
\u6210\u9999 PRO_LANCE
9
\u6210\u6842 PRO_KNIGHT
10
\u6210\u9280 PRO_SILVER
11
\u99ac\u3000 HORSE
12
\u9f8d\u3000 DRAGON
13
"},{"location":"shogi/#action","title":"Action","text":"The design of action also follows that of dlshogi. There are 2187 = 81 x 27
distinct actions. The action can be decomposed into
direction
from which the piece moves anddestination
to which the piece movesby direction, destination = action // 81, action % 81
. The direction
is encoded by
Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Draw 0
"},{"location":"shogi/#termination","title":"Termination","text":"TBA
"},{"location":"shogi/#version-history","title":"Version History","text":"v0
: Initial release (v1.0.0)
"},{"location":"sparrow_mahjong/#description","title":"Description","text":"
TBA
"},{"location":"tic_tac_toe/","title":"Tic-tac-toe","text":"darklight
"},{"location":"tic_tac_toe/#usage","title":"Usage","text":"
import pgx\nenv = pgx.make(\"tic_tac_toe\")\n
or you can directly load TicTacToe
class
from pgx.tic_tac_toe import TicTacToe\nenv = TicTacToe()\n
"},{"location":"tic_tac_toe/#description","title":"Description","text":"Tic-tac-toe is a paper-and-pencil game for two players who take turns marking the spaces in a three-by-three grid with X or O. The player who succeeds in placing three of their marks in a horizontal, vertical, or diagonal row is the winner.
Wikipedia
"},{"location":"tic_tac_toe/#specs","title":"Specs","text":"Name Value Versionv0
Number of players 2
Number of actions 9
Observation shape (3, 3, 2)
Observation type bool
Rewards {-1, 0, 1}
"},{"location":"tic_tac_toe/#observation","title":"Observation","text":"Index Description [:, :, 0]
represents (3, 3)
squares filled by the current player [:, :, 1]
represents (3, 3)
squares filled by the opponent player of current player"},{"location":"tic_tac_toe/#action","title":"Action","text":"Each action represents the square index to be filled.
"},{"location":"tic_tac_toe/#rewards","title":"Rewards","text":"Non-zero rewards are given only at the terminal states. The reward at terminal state is described in this table:
Reward Win+1
Lose -1
Draw 0
"},{"location":"tic_tac_toe/#termination","title":"Termination","text":"Termination happens when
v0
: Initial release (v1.0.0)+ +
++ +
+or you can directly load Shogi
class
TBA
+Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +2187 |
+
Observation shape | +(9, 9, 119) |
+
Observation type | +bool |
+
Rewards | +{-1, 0, 1} |
+
We follow the observation design of dlshogi, an open-source shogi AI.
+Ther original dlshogi implementations are here.
+Pgx implementation has [9, 9, 119]
shape and [:, :, x]
denotes:
x |
+Description | +
---|---|
0:14 |
+Where my piece x exists |
+
14:28 |
+Where my pieces x are attacking |
+
28:31 |
+Where the number of my attacking pieces are >= 1,2,3 respectively |
+
31:45 |
+Where opponent's piece x exists |
+
45:59 |
+Where opponent's pieces x are attacking |
+
59:62 |
+Where the number of opponent's attacking pieces are >= 1,2,3 respectively |
+
The following planes are all ones ore zeros
+x |
+Description | +
---|---|
62:70 |
+My hand has >= 1, ..., 8 Pawn |
+
70:74 |
+My hand has >= 1, 2, 3, 4 Lance |
+
74:78 |
+My hand has >= 1, 2, 3, 4 Knight |
+
78:82 |
+My hand has >= 1, 2, 3, 4 Silver |
+
82:86 |
+My hand has >= 1, 2, 3, 4 Gold |
+
86:88 |
+My hand has >= 1, 2 Bishop |
+
88:90 |
+My hand has >= 1, 2 Rook |
+
90:98 |
+Oppnent's hand has >= 1, ..., 8 Pawn |
+
98:102 |
+Oppnent's hand has >= 1, 2, 3, 4 Lance |
+
102:106 |
+Oppnent's hand has >= 1, 2, 3, 4 Knight |
+
106:110 |
+Oppnent's hand has >= 1, 2, 3, 4 Silver |
+
110:114 |
+Oppnent's hand has >= 1, 2, 3, 4 Gold |
+
114:116 |
+Oppnent's hand has >= 1, 2 Bishop |
+
116:118 |
+Oppnent's hand has >= 1, 2 Rook |
+
118 |
+Ones if checked | +
Note that piece ids are
+Piece | +Id | +
---|---|
歩 PAWN |
+0 |
+
香 LANCE |
+1 |
+
桂 KNIGHT |
+2 |
+
銀 SILVER |
+3 |
+
角 BISHOP |
+4 |
+
飛 ROOK |
+5 |
+
金 GOLD |
+6 |
+
玉 KING |
+7 |
+
と PRO_PAWN |
+8 |
+
成香 PRO_LANCE |
+9 |
+
成桂 PRO_KNIGHT |
+10 |
+
成銀 PRO_SILVER |
+11 |
+
馬 HORSE |
+12 |
+
龍 DRAGON |
+13 |
+
The design of action also follows that of dlshogi.
+There are 2187 = 81 x 27
distinct actions.
+The action can be decomposed into
direction
from which the piece moves anddestination
to which the piece movesby direction, destination = action // 81, action % 81
.
+The direction
is encoded by
id | +direction | +
---|---|
0 | +Up | +
1 | +Up left | +
2 | +Up right | +
3 | +Left | +
4 | +Right | +
5 | +Down | +
6 | +Down left | +
7 | +Down right | +
8 | +Up2 left | +
9 | +Up2 right | +
10 | +Promote + Up | +
11 | +Promote + Up left | +
12 | +Promote + Up right | +
13 | +Promote + Left | +
14 | +Promote + Right | +
15 | +Promote + Down | +
16 | +Promote + Down left | +
17 | +Promote + Down right | +
18 | +Promote + Up2 left | +
19 | +Promote + Up2 right | +
20 | +Drop Pawn | +
21 | +Drop Lance | +
22 | +Drop Knight | +
23 | +Drop Silver | +
24 | +Drop Bishop | +
25 | +Drop Rook | +
26 | +Drop Gold | +
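A minimal sketch (illustrative only) of splitting an action index according to the decomposition and direction table above:

```py
def decode_action(action: int):
    direction, destination = action // 81, action % 81
    is_promotion = 10 <= direction <= 19  # directions 10-19 are moves with promotion
    is_drop = direction >= 20             # directions 20-26 drop a captured piece
    return direction, destination, is_promotion, is_drop
```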
Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Draw | +0 |
+
TBA
+v0
: Initial release (v1.0.0)+ +
++ +
+TBA
+ + + + + + ++ +
++ +
+or you can directly load TicTacToe
class
++Tic-tac-toe is a paper-and-pencil game for two players who take turns marking the spaces in a three-by-three grid with X or O. The player who succeeds in placing three of their marks in a horizontal, vertical, or diagonal row is the winner.
+ +
Name | +Value | +
---|---|
Version | +v0 |
+
Number of players | +2 |
+
Number of actions | +9 |
+
Observation shape | +(3, 3, 2) |
+
Observation type | +bool |
+
Rewards | +{-1, 0, 1} |
+
Index | +Description | +
---|---|
[:, :, 0] |
+represents (3, 3) squares filled by the current player |
+
[:, :, 1] |
+represents (3, 3) squares filled by the opponent player of current player |
+
Each action represents the square index to be filled.
+Non-zero rewards are given only at the terminal states. +The reward at terminal state is described in this table:
++ | Reward | +
---|---|
Win | ++1 |
+
Lose | +-1 |
+
Draw | +0 |
+
Termination happens when
+v0
: Initial release (v1.0.0)