-
Notifications
You must be signed in to change notification settings - Fork 2
/
perft.cu
164 lines (133 loc) · 5.07 KB
/
perft.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cstdlib>
#include <ctime>
#include <mutex>
#include <thread>
#include "device_launch_parameters.h"
#include "perft_bb.h"
#include "InfInt.h"
#include "launcher.h"
// Forward declarations of functions that are not defined in the visible part
// of this file. main() calls them only when MULTI_NODE_NETWORK_MODE == 1:
// createNetworkThread() before the per-GPU initialisation loop, and
// endNetworkThread() after all perft depths have been run.
// NOTE(review): presumably these start/stop a background networking thread
// used for multi-node operation -- confirm against their definitions.
void createNetworkThread();
void endNetworkThread();
int main(int argc, char *argv[])
{
std::srand(std::time(0));
#if PERFT_RECORDS_MODE == 1
processPerftRecords(argc, argv);
return 0;
#endif
BoardPosition testBoard;
int totalGPUs;
cudaGetDeviceCount(&totalGPUs);
printf("No of GPUs detected: %d", totalGPUs);
if (argc >= 4)
{
numGPUs = atoi(argv[3]);
if (numGPUs < 1)
numGPUs = 1;
if (numGPUs > totalGPUs)
numGPUs = totalGPUs;
printf("\nUsing %d GPUs\n", numGPUs);
}
else
{
numGPUs = totalGPUs;
}
checkAndCreateDiskHash();
allocCompleteTT();
#if MULTI_NODE_NETWORK_MODE == 1
createNetworkThread();
#endif
for (int g = 0; g < numGPUs; g++)
{
initGPU(g);
#if USE_TRANSPOSITION_TABLE == 1
setupHashTables128b(TransTables128b[g]);
#endif
MoveGeneratorBitboard::init();
}
// set default device to device 0
cudaSetDevice(0);
// some test board positions from http://chessprogramming.wikispaces.com/Perft+Results
//Utils::readFENString("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1", &testBoard); // start.. 20 positions
Utils::readFENString("r3k2r/p1ppqpb1/bn2pnp1/3PN3/1p2P3/2N2Q1p/PPPBBPPP/R3K2R w KQkq -", &testBoard); // position 2 (caught max bugs for me)
//Utils::readFENString("8/2p5/3p4/KP5r/1R3p1k/8/4P1P1/8 w - -", &testBoard); // position 3
//Utils::readFENString("r2q1rk1/pP1p2pp/Q4n2/bbp1p3/Np6/1B3NBn/pPPP1PPP/R3K2R b KQ - 0 1", &testBoard); // position 4
//Utils::readFENString("r3k2r/Pppp1ppp/1b3nbN/nP6/BBP1P3/q4N2/Pp1P2PP/R2Q1RK1 w kq - 0 1", &testBoard); // mirror of position 4
//Utils::readFENString("rnbqkb1r/pp1p1ppp/2p5/4P3/2B5/8/PPP1NnPP/RNBQK2R w KQkq - 0 6", &testBoard); // position 5
//Utils::readFENString("3Q4/1Q4Q1/4Q3/2Q4R/Q4Q2/3Q4/1Q4Rp/1K1BBNNk w - - 0 1", &testBoard); // - 218 positions.. correct!
//Utils::readFENString("r1b1kbnr/pppp1ppp/2n1p3/6q1/6Q1/2N1P3/PPPP1PPP/R1B1KBNR w KQkq - 4 4", &testBoard); // temp test
int minDepth = 3;
int maxDepth = 3;
char fen[1024];
if (argc >= 3)
{
strcpy(fen, argv[1]);
maxDepth = atoi(argv[2]);
}
else
{
printf("\nUsage perft_gpu <fen> <depth> [<launchdepth>]\n");
printf("\nAs no paramaters were provided... running default test\n");
}
if (strlen(fen) > 5)
{
Utils::readFENString(fen, &testBoard);
}
else
{
Utils::readFENString("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1", &testBoard); // start.. 20 positions
}
Utils::dispBoard(&testBoard);
HexaBitBoardPosition testBB;
Utils::board088ToHexBB(&testBB, &testBoard);
Utils::boardHexBBTo088(&testBoard, &testBB);
// launchDepth is the depth at which the driver kernel launches the work kernels
// we decide launch depth based by estimating memory requirment of the work kernel that would be launched.
// TODO: need more accurate method to estimate launch depth
// branching factor near the root is not accurate. E.g, for start pos, at root branching factor = 20
// and we estimate launch depth = 6.. which would seem quite conservative (20^6 = 64M)
// at depth 10, the avg branching factor is nearly 30 and 30^6 = 729M which is > 10X initial estimate :-/
// At launch depth 6, some launches for perft 9 start using up > 350 MB memory
// 384 MB is not sufficient for computing perft 10 (some of the launches consume more than that)
// and 1 GB is not sufficient for computing perft 11!
uint32 launchDepth = estimateLaunchDepth(&testBB);
launchDepth = min(launchDepth, 11); // don't go too high
#if USE_TRANSPOSITION_TABLE == 0
// for best performance without GPU hash (also set PREALLOCATED_MEMORY_SIZE to 3 x 768MB)
launchDepth = 6; // ankan - test!
#endif
if (argc >= 5)
{
launchDepth = atoi(argv[4]);
}
if (maxDepth < launchDepth)
{
launchDepth = maxDepth;
}
fflush(stdout);
for (int depth = minDepth; depth <= maxDepth; depth++)
{
perftLauncher(&testBB, depth, launchDepth);
fflush(stdout);
}
#if MULTI_NODE_NETWORK_MODE == 1
endNetworkThread();
#endif
#if USE_TRANSPOSITION_TABLE == 1
freeHashTables();
#endif
freeCompleteTT();
for (int g = 0; g < numGPUs; g++)
{
cudaFree(preAllocatedBufferHost[g]);
cudaDeviceReset();
}
#if USE_TRANSPOSITION_TABLE == 1
printf("\nComplete hash sysmem memory usage: %llu bytes\n", ((uint64) chainIndex) * sizeof(CompleteHashEntry));
printf("\nMax tree storage GPU memory usage: %llu bytes\n", maxMemoryUsage);
printf("Regular depth %d Launches: %d\n", GPU_LAUNCH_DEPTH, numRegularLaunches);
printf("Retry launches: %d\n", numRetryLaunches);
printf("No of work items recieved from peers: %llu\n", numItemsFromPeers);
#endif
return 0;
}