From bb3c6ce75ec79217d0df63c0a269864b2e951edf Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 10:21:32 -0400 Subject: [PATCH 01/10] switch to using header api calls instead of block api calls --- gpu-miner.c | 16 ++++------------ network.c | 20 ++++++++------------ network.h | 4 ++-- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/gpu-miner.c b/gpu-miner.c index 3842853..eee6b14 100644 --- a/gpu-miner.c +++ b/gpu-miner.c @@ -63,12 +63,8 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) { memset(headerHash, 255, 32); memset(target, 255, 32); - // Store block from siad - uint8_t *block; - size_t blocklen = 0; - // Get new block header and target - if (get_block_for_work(curl, target, blockHeader, &block, &blocklen) != 0) { + if (get_header_for_work(curl, target, blockHeader) != 0) { return 0; } @@ -84,7 +80,6 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) { printf("e.g. \"./gpu-miner -s 3 -c 200\"\n"); printf("Waiting for problem to be resolved..."); fflush(stdout); - return -1; } target_corrupt_flag = 0; @@ -118,17 +113,14 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) { // Did we find one? if (memcmp(headerHash, target, 8) < 0) { - // Copy nonce to block - memcpy(block+32, nonceOut, 8); - submit_block(curl, block, blocklen); + // Copy nonce to header. + memcpy(blockHeader+32, nonceOut, 8); + submit_header(curl, blockHeader); blocks_mined++; return -1; } } - // Free memory allocated in network.c - free(block); - // Hashrate is inaccurate if a block was found #ifdef __linux__ clock_gettime(CLOCK_REALTIME, &end); diff --git a/network.c b/network.c index 5346a54..919edb4 100644 --- a/network.c +++ b/network.c @@ -14,8 +14,8 @@ char *bfw_url, *submit_url; void set_port(char *port) { bfw_url = malloc(29 + strlen(port)); submit_url = malloc(28 + strlen(port)); - sprintf(bfw_url, "localhost:%s/miner/blockforwork", port); - sprintf(submit_url, "localhost:%s/miner/submitblock", port); + sprintf(bfw_url, "localhost:%s/miner/headerforwork", port); + sprintf(submit_url, "localhost:%s/miner/submitheader", port); } // Write network data to an array of bytes @@ -34,7 +34,7 @@ size_t writefunc(void *ptr, size_t size, size_t nmemb, struct inData *in) { return size*nmemb; } -int get_block_for_work(CURL *curl, uint8_t *target, uint8_t *header, uint8_t **block, size_t *blocklen) { +int get_header_for_work(CURL *curl, uint8_t *target, uint8_t *header) { if (!curl) { fprintf(stderr, "Invalid curl object passed to get_block_for_work()\n"); exit(1); @@ -55,31 +55,27 @@ int get_block_for_work(CURL *curl, uint8_t *target, uint8_t *header, uint8_t **b fprintf(stderr, "Are you sure that siad is running?\n"); exit(1); } - if (in.len < 174) { - fprintf(stderr, "curl did not receive enough bytes (got %zu, expected at least 174)\n", in.len); + if (in.len != 112) { + fprintf(stderr, "curl did not receive correct bytes (got %zu, expected 112)\n", in.len); return 1; } // Copy data to return - *blocklen = in.len - 112; - *block = (uint8_t*)malloc(*blocklen); memcpy(target, in.bytes, 32); memcpy(header, in.bytes+32, 80); - memcpy(*block, in.bytes+112, in.len-112); return 0; } -void submit_block(CURL *curl, uint8_t *block, size_t blocklen) { +void submit_header(CURL *curl, uint8_t *header) { if (curl) { CURLcode res; - curl_off_t numBytes = blocklen; curl_easy_reset(curl); curl_easy_setopt(curl, CURLOPT_URL, submit_url); curl_easy_setopt(curl, CURLOPT_POST, 1); - curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, numBytes); - curl_easy_setopt(curl, CURLOPT_POSTFIELDS, block); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, 80); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, header); // Prevent printing to stdout curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writefunc); curl_easy_setopt(curl, CURLOPT_WRITEDATA, NULL); diff --git a/network.h b/network.h index b977ca5..73bc8bf 100644 --- a/network.h +++ b/network.h @@ -2,5 +2,5 @@ #include void set_port(char *port); -int get_block_for_work(CURL *curl, uint8_t *target, uint8_t *header, uint8_t **block, size_t *blocklen); -void submit_block(CURL *curl, uint8_t *block, size_t blocklen); \ No newline at end of file +int get_header_for_work(CURL *curl, uint8_t *target, uint8_t *header); +void submit_header(CURL *curl, uint8_t *header); From 1ea426c89a5e084981edb25a01a54e7da0720a64 Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 11:18:07 -0400 Subject: [PATCH 02/10] minor improvements --- gpu-miner.c | 2 +- gpu-miner.cl | 36 ++++++++++++++---------------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/gpu-miner.c b/gpu-miner.c index eee6b14..fd7db73 100644 --- a/gpu-miner.c +++ b/gpu-miner.c @@ -69,7 +69,7 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) { } // Check for target corruption - if (target[0] != 0 || target[1] != 0 || target[2] != 0 || target[3] != 0) { + if (target[0] != 0 || target[1] != 0) { if (target_corrupt_flag) { return -1; } diff --git a/gpu-miner.cl b/gpu-miner.cl index 40d779f..7baf01f 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -7,11 +7,13 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl private uchar target[32]; headerHash[0] = 255; - int i, z; + int i; +#pragma unroll for (i = 0; i < 32; i++) { target[i] = targ[i]; header[i] = headerIn[i]; } +#pragma unroll for (i = 32; i < 80; i++) { header[i] = headerIn[i]; } @@ -28,7 +30,7 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl blake2b(headerHash, header); // Compare header to target - z = 0; + int z = 0; while (target[z] == headerHash[z]) { z++; } @@ -57,6 +59,7 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t int i = 0 ; char *dest8 = (char*)dest; char *src8 = (char*)src; +#pragma unroll for (int i = 0; i < num; i++) { dest8[i] = src8[i]; } @@ -68,26 +71,15 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t #define ALIGN(x) __attribute__((aligned(x))) #endif - enum blake2b_constant - { - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 - }; - -#pragma pack(push, 1) - ALIGN( 64 ) typedef struct __blake2b_state + typedef struct __blake2b_state { ulong h[8]; ulong t[2]; ulong f[2]; - uchar buf[2 * BLAKE2B_BLOCKBYTES]; + uchar buf[256]; size_t buflen; uchar last_node; } blake2b_state; -#pragma pack(pop) // Streaming API int blake2b_update( __private blake2b_state *S, __private const uchar *in, __private ulong inlen ); @@ -133,7 +125,7 @@ __constant uchar blake2b_sigma[12][16] = { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; -static int blake2b_compress( __private blake2b_state *S, __private const uchar block[BLAKE2B_BLOCKBYTES] ) +static int blake2b_compress( __private blake2b_state *S, __private const uchar block[128] ) { ulong m[16]; ulong v[16]; @@ -207,17 +199,17 @@ int blake2b( __private uchar *out, __private uchar *in ) ulong inlen = 80; size_t left = S->buflen; - size_t fill = 2 * BLAKE2B_BLOCKBYTES - left; + size_t fill = 2 * 128 - left; if( inlen > fill ) { clmemcpy( S->buf + left, in, fill ); // Fill buffer S->buflen += fill; blake2b_compress( S, S->buf ); // Compress - clmemcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2B_BLOCKBYTES; + clmemcpy( S->buf, S->buf + 128, 128 ); // Shift buffer left + S->buflen -= 128; } - else // inlen <= fill + else { clmemcpy( S->buf + left, in, inlen ); S->buflen += inlen; // Be lazy, do not compress @@ -226,10 +218,10 @@ int blake2b( __private uchar *out, __private uchar *in ) S->t[0] += S->buflen; S->f[0] = ~((ulong)0); - clmemset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); // Padding + clmemset( S->buf + S->buflen, 0, 2 * 128 - S->buflen ); // Padding blake2b_compress( S, S->buf ); - uchar buffer[BLAKE2B_OUTBYTES]; + uchar buffer[64]; for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); From d50cd1e4be60e04e79791adb28c544d3561585c6 Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 11:47:00 -0400 Subject: [PATCH 03/10] stubborn inlining of everything --- gpu-miner.cl | 59 +++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 42 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index 7baf01f..caeb69b 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -2,7 +2,7 @@ int blake2b( uchar *out, uchar *in ); // The kernel that grinds nonces until it finds a hash below the target __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) { - private uchar header[80]; + private uchar header[256]; private uchar headerHash[32]; private uchar target[32]; headerHash[0] = 255; @@ -77,14 +77,9 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t ulong t[2]; ulong f[2]; uchar buf[256]; - size_t buflen; uchar last_node; } blake2b_state; - // Streaming API - int blake2b_update( __private blake2b_state *S, __private const uchar *in, __private ulong inlen ); - int blake2b_final( __private blake2b_state *S, __private uchar *out ); - static inline ulong load64( __private const void *src ) { return *( ulong * )( src ); @@ -127,12 +122,27 @@ __constant uchar blake2b_sigma[12][16] = static int blake2b_compress( __private blake2b_state *S, __private const uchar block[128] ) { + return 0; +} + +int blake2b( __private uchar *out, __private uchar *in ) +{ + // Initialize a state. + private blake2b_state S[1]; + clmemset( S, 0, sizeof( blake2b_state ) ); + for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; + S->h[0] ^= 0x0000000001010020UL; + + S->t[0] += 80; + S->f[0] = ~((ulong)0); + clmemset( in + 80, 0, 2 * 128 - 80 ); // Padding + ulong m[16]; ulong v[16]; int i; for( i = 0; i < 16; ++i ) - m[i] = load64( block + i * sizeof( m[i] ) ); + m[i] = load64( in + i * sizeof( m[i] ) ); for( i = 0; i < 8; ++i ) v[i] = S->h[i]; @@ -185,41 +195,6 @@ static int blake2b_compress( __private blake2b_state *S, __private const uchar b #undef G #undef ROUND - return 0; -} - -// inlen, at least, should be ulong. Others can be size_t. -int blake2b( __private uchar *out, __private uchar *in ) -{ - private blake2b_state S[1]; - - clmemset( S, 0, sizeof( blake2b_state ) ); - for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; - S->h[0] ^= 0x0000000001010020UL; - - ulong inlen = 80; - size_t left = S->buflen; - size_t fill = 2 * 128 - left; - - if( inlen > fill ) - { - clmemcpy( S->buf + left, in, fill ); // Fill buffer - S->buflen += fill; - blake2b_compress( S, S->buf ); // Compress - clmemcpy( S->buf, S->buf + 128, 128 ); // Shift buffer left - S->buflen -= 128; - } - else - { - clmemcpy( S->buf + left, in, inlen ); - S->buflen += inlen; // Be lazy, do not compress - } - - - S->t[0] += S->buflen; - S->f[0] = ~((ulong)0); - clmemset( S->buf + S->buflen, 0, 2 * 128 - S->buflen ); // Padding - blake2b_compress( S, S->buf ); uchar buffer[64]; for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer From b1a9b2989ecddcd9869c9d6cb2e0297db150b108 Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 11:49:44 -0400 Subject: [PATCH 04/10] clear extra memset --- gpu-miner.cl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index caeb69b..d1f468f 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -2,7 +2,7 @@ int blake2b( uchar *out, uchar *in ); // The kernel that grinds nonces until it finds a hash below the target __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) { - private uchar header[256]; + private uchar header[256] = {0}; private uchar headerHash[32]; private uchar target[32]; headerHash[0] = 255; @@ -135,7 +135,6 @@ int blake2b( __private uchar *out, __private uchar *in ) S->t[0] += 80; S->f[0] = ~((ulong)0); - clmemset( in + 80, 0, 2 * 128 - 80 ); // Padding ulong m[16]; ulong v[16]; From 8dee5f831a98b12fbd92386baf49f0e727e22bab Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 11:59:21 -0400 Subject: [PATCH 05/10] inline blake2b --- gpu-miner.cl | 99 ++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index d1f468f..d0eadf2 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -1,51 +1,5 @@ int blake2b( uchar *out, uchar *in ); -// The kernel that grinds nonces until it finds a hash below the target -__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) { - private uchar header[256] = {0}; - private uchar headerHash[32]; - private uchar target[32]; - headerHash[0] = 255; - - int i; -#pragma unroll - for (i = 0; i < 32; i++) { - target[i] = targ[i]; - header[i] = headerIn[i]; - } -#pragma unroll - for (i = 32; i < 80; i++) { - header[i] = headerIn[i]; - } - - // Set nonce - private int id = get_global_id(0); - // Support global work sizes of up to 256^4 - 1 - header[32] = id / (256 * 256 * 256); - header[33] = id / (256 * 256); - header[34] = id / 256; - header[35] = id % 256; - - // Hash the header - blake2b(headerHash, header); - - // Compare header to target - int z = 0; - while (target[z] == headerHash[z]) { - z++; - } - if (headerHash[z] < target[z]) { - // Transfer the output to global space. - for (i = 0; i < 8; i++) { - nonceOut[i] = header[i + 32]; - } - for (i = 0; i < 32; i++) { - hashOut[i] = headerHash[i]; - } - return; - } -} - // Implementations of clmemset and memcopy void *clmemset( __private void *s, __private int c, __private size_t n) { uchar *p = s; @@ -127,6 +81,38 @@ static int blake2b_compress( __private blake2b_state *S, __private const uchar b int blake2b( __private uchar *out, __private uchar *in ) { + + return 0; +} + +// The kernel that grinds nonces until it finds a hash below the target +__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) { + private uchar header[256] = {0}; + private uchar headerHash[32]; + private uchar target[32]; + headerHash[0] = 255; + + int i; +#pragma unroll + for (i = 0; i < 32; i++) { + target[i] = targ[i]; + header[i] = headerIn[i]; + } +#pragma unroll + for (i = 32; i < 80; i++) { + header[i] = headerIn[i]; + } + + // Set nonce + private int id = get_global_id(0); + // Support global work sizes of up to 256^4 - 1 + header[32] = id / (256 * 256 * 256); + header[33] = id / (256 * 256); + header[34] = id / 256; + header[35] = id % 256; + + // Hash the header + // blake2b(headerHash, header); // Initialize a state. private blake2b_state S[1]; clmemset( S, 0, sizeof( blake2b_state ) ); @@ -138,10 +124,9 @@ int blake2b( __private uchar *out, __private uchar *in ) ulong m[16]; ulong v[16]; - int i; for( i = 0; i < 16; ++i ) - m[i] = load64( in + i * sizeof( m[i] ) ); + m[i] = load64( header + i * sizeof( m[i] ) ); for( i = 0; i < 8; ++i ) v[i] = S->h[i]; @@ -199,7 +184,21 @@ int blake2b( __private uchar *out, __private uchar *in ) for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - clmemcpy( out, buffer, 32 ); + clmemcpy( headerHash, buffer, 32 ); - return 0; + // Compare header to target + int z = 0; + while (target[z] == headerHash[z]) { + z++; + } + if (headerHash[z] < target[z]) { + // Transfer the output to global space. + for (i = 0; i < 8; i++) { + nonceOut[i] = header[i + 32]; + } + for (i = 0; i < 32; i++) { + hashOut[i] = headerHash[i]; + } + return; + } } From 369fbfe9357991004090269c6d4f6b2110c4f36e Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 12:04:23 -0400 Subject: [PATCH 06/10] memcpy to headerHash --- gpu-miner.cl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index d0eadf2..83994fc 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -1,5 +1,3 @@ -int blake2b( uchar *out, uchar *in ); - // Implementations of clmemset and memcopy void *clmemset( __private void *s, __private int c, __private size_t n) { uchar *p = s; @@ -74,17 +72,6 @@ __constant uchar blake2b_sigma[12][16] = { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; -static int blake2b_compress( __private blake2b_state *S, __private const uchar block[128] ) -{ - return 0; -} - -int blake2b( __private uchar *out, __private uchar *in ) -{ - - return 0; -} - // The kernel that grinds nonces until it finds a hash below the target __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) { private uchar header[256] = {0}; From 4324c4527d7af364a114144ef75ed64eed61d47b Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 12:15:14 -0400 Subject: [PATCH 07/10] toss blake2b state struct --- gpu-miner.cl | 60 ++++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index 83994fc..29b8f8c 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -1,12 +1,3 @@ -// Implementations of clmemset and memcopy -void *clmemset( __private void *s, __private int c, __private size_t n) { - uchar *p = s; - while(n--) { - *p++ = (uchar)c; - } - return s; -} - void clmemcpy( __private void *dest, __private const void *src, __private size_t num) { int i = 0 ; char *dest8 = (char*)dest; @@ -23,14 +14,6 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t #define ALIGN(x) __attribute__((aligned(x))) #endif - typedef struct __blake2b_state - { - ulong h[8]; - ulong t[2]; - ulong f[2]; - uchar buf[256]; - uchar last_node; - } blake2b_state; static inline ulong load64( __private const void *src ) { @@ -73,16 +56,16 @@ __constant uchar blake2b_sigma[12][16] = }; // The kernel that grinds nonces until it finds a hash below the target -__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) { +__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targetIn, __global uchar *nonceOut) { private uchar header[256] = {0}; - private uchar headerHash[32]; + private uchar headerHash[64]; private uchar target[32]; headerHash[0] = 255; int i; #pragma unroll for (i = 0; i < 32; i++) { - target[i] = targ[i]; + target[i] = targetIn[i]; header[i] = headerIn[i]; } #pragma unroll @@ -98,16 +81,13 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl header[34] = id / 256; header[35] = id % 256; - // Hash the header - // blake2b(headerHash, header); - // Initialize a state. - private blake2b_state S[1]; - clmemset( S, 0, sizeof( blake2b_state ) ); - for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; - S->h[0] ^= 0x0000000001010020UL; - - S->t[0] += 80; - S->f[0] = ~((ulong)0); + // BLAKE2B START + ulong h[8] = {0}; + ulong t[2] = {80, 0}; + ulong f[2] = {0}; + for( int i = 0; i < 8; ++i ) h[i] = blake2b_IV[i]; + h[0] ^= 0x0000000001010020UL; + f[0] = ~((ulong)0); ulong m[16]; ulong v[16]; @@ -116,16 +96,16 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl m[i] = load64( header + i * sizeof( m[i] ) ); for( i = 0; i < 8; ++i ) - v[i] = S->h[i]; + v[i] = h[i]; v[ 8] = blake2b_IV[0]; v[ 9] = blake2b_IV[1]; v[10] = blake2b_IV[2]; v[11] = blake2b_IV[3]; - v[12] = S->t[0] ^ blake2b_IV[4]; - v[13] = S->t[1] ^ blake2b_IV[5]; - v[14] = S->f[0] ^ blake2b_IV[6]; - v[15] = S->f[1] ^ blake2b_IV[7]; + v[12] = t[0] ^ blake2b_IV[4]; + v[13] = t[1] ^ blake2b_IV[5]; + v[14] = f[0] ^ blake2b_IV[6]; + v[15] = f[1] ^ blake2b_IV[7]; #define G(r,i,a,b,c,d) \ do { \ a = a + b + m[blake2b_sigma[r][2*i+0]]; \ @@ -162,16 +142,14 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl ROUND( 11 ); for( i = 0; i < 8; ++i ) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + h[i] = h[i] ^ v[i] ^ v[i + 8]; #undef G #undef ROUND - uchar buffer[64]; for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer - store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - - clmemcpy( headerHash, buffer, 32 ); + store64( headerHash + sizeof( h[i] ) * i, h[i] ); + // BLAKE2B END // Compare header to target int z = 0; @@ -180,9 +158,11 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl } if (headerHash[z] < target[z]) { // Transfer the output to global space. +#pragma unroll for (i = 0; i < 8; i++) { nonceOut[i] = header[i + 32]; } +#pragma unroll for (i = 0; i < 32; i++) { hashOut[i] = headerHash[i]; } From eec327dce41acb900eb53889735fcd792dfeb89b Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 12:41:15 -0400 Subject: [PATCH 08/10] smoosh things --- gpu-miner.cl | 103 ++++++++++++++++----------------------------------- 1 file changed, 32 insertions(+), 71 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index 29b8f8c..7013fbd 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -1,20 +1,3 @@ -void clmemcpy( __private void *dest, __private const void *src, __private size_t num) { - int i = 0 ; - char *dest8 = (char*)dest; - char *src8 = (char*)src; -#pragma unroll - for (int i = 0; i < num; i++) { - dest8[i] = src8[i]; - } -} - -#if defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__((aligned(x))) -#endif - - static inline ulong load64( __private const void *src ) { return *( ulong * )( src ); @@ -29,16 +12,6 @@ static inline ulong rotr64( __private const ulong w, __private const unsigned c { return ( w >> c ) | ( w << ( 64 - c ) ); } - -// blake2b-ref.c -__constant ulong blake2b_IV[8] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 -}; - __constant uchar blake2b_sigma[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , @@ -60,8 +33,8 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl private uchar header[256] = {0}; private uchar headerHash[64]; private uchar target[32]; - headerHash[0] = 255; + // Transfer inputs from global memory int i; #pragma unroll for (i = 0; i < 32; i++) { @@ -82,52 +55,43 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl header[35] = id % 256; // BLAKE2B START - ulong h[8] = {0}; + ulong iv[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 }; + ulong v[16]; ulong t[2] = {80, 0}; - ulong f[2] = {0}; - for( int i = 0; i < 8; ++i ) h[i] = blake2b_IV[i]; - h[0] ^= 0x0000000001010020UL; - f[0] = ~((ulong)0); - + ulong f[2] = {~0, 0}; ulong m[16]; - ulong v[16]; - for( i = 0; i < 16; ++i ) m[i] = load64( header + i * sizeof( m[i] ) ); - + v[ 8] = iv[0]; + v[ 9] = iv[1]; + v[10] = iv[2]; + v[11] = iv[3]; + v[12] = t[0] ^ iv[4]; + v[13] = t[1] ^ iv[5]; + v[14] = f[0] ^ iv[6]; + v[15] = f[1] ^ iv[7]; + iv[0] ^= 0x0000000001010020UL; for( i = 0; i < 8; ++i ) - v[i] = h[i]; + v[i] = iv[i]; - v[ 8] = blake2b_IV[0]; - v[ 9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = t[0] ^ blake2b_IV[4]; - v[13] = t[1] ^ blake2b_IV[5]; - v[14] = f[0] ^ blake2b_IV[6]; - v[15] = f[1] ^ blake2b_IV[7]; #define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while(0) + a = a + b + m[blake2b_sigma[r][2*i+0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2*i+1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); #define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); ROUND( 0 ); ROUND( 1 ); ROUND( 2 ); @@ -140,15 +104,12 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl ROUND( 9 ); ROUND( 10 ); ROUND( 11 ); - for( i = 0; i < 8; ++i ) - h[i] = h[i] ^ v[i] ^ v[i + 8]; - + iv[i] = iv[i] ^ v[i] ^ v[i + 8]; #undef G #undef ROUND - for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer - store64( headerHash + sizeof( h[i] ) * i, h[i] ); + store64( headerHash + sizeof( iv[i] ) * i, iv[i] ); // BLAKE2B END // Compare header to target From fb82351d541729792a734c4d476a8dddb0ee9a3e Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 13:15:43 -0400 Subject: [PATCH 09/10] merge memory structures --- gpu-miner.cl | 90 +++++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index 7013fbd..d9e0b9d 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -1,37 +1,11 @@ -static inline ulong load64( __private const void *src ) -{ - return *( ulong * )( src ); -} - -static inline void store64( __private void *dst, __private ulong w ) -{ - *( ulong * )( dst ) = w; -} - static inline ulong rotr64( __private const ulong w, __private const unsigned c ) { return ( w >> c ) | ( w << ( 64 - c ) ); } -__constant uchar blake2b_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; // The kernel that grinds nonces until it finds a hash below the target __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targetIn, __global uchar *nonceOut) { private uchar header[256] = {0}; - private uchar headerHash[64]; private uchar target[32]; // Transfer inputs from global memory @@ -48,32 +22,26 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl // Set nonce private int id = get_global_id(0); - // Support global work sizes of up to 256^4 - 1 header[32] = id / (256 * 256 * 256); header[33] = id / (256 * 256); header[34] = id / 256; header[35] = id % 256; - // BLAKE2B START - ulong iv[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 }; - ulong v[16]; - ulong t[2] = {80, 0}; - ulong f[2] = {~0, 0}; - ulong m[16]; - for( i = 0; i < 16; ++i ) - m[i] = load64( header + i * sizeof( m[i] ) ); - v[ 8] = iv[0]; - v[ 9] = iv[1]; - v[10] = iv[2]; - v[11] = iv[3]; - v[12] = t[0] ^ iv[4]; - v[13] = t[1] ^ iv[5]; - v[14] = f[0] ^ iv[6]; - v[15] = f[1] ^ iv[7]; - iv[0] ^= 0x0000000001010020UL; - for( i = 0; i < 8; ++i ) - v[i] = iv[i]; - + uchar blake2b_sigma[12][16] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + }; #define G(r,i,a,b,c,d) \ a = a + b + m[blake2b_sigma[r][2*i+0]]; \ d = rotr64(d ^ a, 32); \ @@ -92,6 +60,26 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl G(r,5,v[ 1],v[ 6],v[11],v[12]); \ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + // BLAKE2B START + + ulong iv[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 }; + ulong v[16]; + ulong t[2] = {80, 0}; + ulong f[2] = {~0, 0}; + ulong m[16]; + for( i = 0; i < 16; ++i ) + m[i] = *(ulong*)( header + i * 8 ); + v[ 8] = iv[0]; + v[ 9] = iv[1]; + v[10] = iv[2]; + v[11] = iv[3]; + v[12] = t[0] ^ iv[4]; + v[13] = t[1] ^ iv[5]; + v[14] = f[0] ^ iv[6]; + v[15] = f[1] ^ iv[7]; + iv[0] ^= 0x0000000001010020UL; + for( i = 0; i < 8; ++i ) + v[i] = iv[i]; ROUND( 0 ); ROUND( 1 ); ROUND( 2 ); @@ -104,13 +92,15 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl ROUND( 9 ); ROUND( 10 ); ROUND( 11 ); + private uchar headerHash[64]; for( i = 0; i < 8; ++i ) iv[i] = iv[i] ^ v[i] ^ v[i + 8]; -#undef G -#undef ROUND for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer - store64( headerHash + sizeof( iv[i] ) * i, iv[i] ); + *(ulong*)(headerHash + 8 * i) = iv[i]; + // BLAKE2B END +#undef G +#undef ROUND // Compare header to target int z = 0; From e7c9524b5075a68aa2b00eeaedf5102717b5a3a9 Mon Sep 17 00:00:00 2001 From: David Vorick Date: Tue, 9 Jun 2015 14:28:18 -0400 Subject: [PATCH 10/10] 32bit magic --- gpu-miner.cl | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/gpu-miner.cl b/gpu-miner.cl index d9e0b9d..92f9ec7 100644 --- a/gpu-miner.cl +++ b/gpu-miner.cl @@ -1,31 +1,24 @@ -static inline ulong rotr64( __private const ulong w, __private const unsigned c ) +static inline ulong rotr64( __const ulong w, __const unsigned c ) { return ( w >> c ) | ( w << ( 64 - c ) ); } // The kernel that grinds nonces until it finds a hash below the target -__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targetIn, __global uchar *nonceOut) { - private uchar header[256] = {0}; - private uchar target[32]; +__kernel void nonceGrind(__global uint *headerIn, __global uchar *hashOut, __global uint *targetIn, __global uchar *nonceOut) { + uchar header[256] = {0}; + uchar target[32]; // Transfer inputs from global memory int i; -#pragma unroll - for (i = 0; i < 32; i++) { - target[i] = targetIn[i]; - header[i] = headerIn[i]; + for (i = 0; i < 8; i++) { + *(uint*)(target + i * 4) = targetIn[i]; } -#pragma unroll - for (i = 32; i < 80; i++) { - header[i] = headerIn[i]; + for (i = 0; i < 20; i++) { + *(uint*)(header + i * 4) = headerIn[i]; } // Set nonce - private int id = get_global_id(0); - header[32] = id / (256 * 256 * 256); - header[33] = id / (256 * 256); - header[34] = id / 256; - header[35] = id % 256; + *(uint*)(header + 32) = get_global_id(0); uchar blake2b_sigma[12][16] = { @@ -92,7 +85,7 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl ROUND( 9 ); ROUND( 10 ); ROUND( 11 ); - private uchar headerHash[64]; + uchar headerHash[64]; for( i = 0; i < 8; ++i ) iv[i] = iv[i] ^ v[i] ^ v[i + 8]; for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer @@ -102,18 +95,16 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl #undef G #undef ROUND - // Compare header to target - int z = 0; - while (target[z] == headerHash[z]) { - z++; + // Compare hash to target + i = 0; + while (target[i] == headerHash[i]) { + i++; } - if (headerHash[z] < target[z]) { + if (headerHash[i] < target[i]) { // Transfer the output to global space. -#pragma unroll for (i = 0; i < 8; i++) { nonceOut[i] = header[i + 32]; } -#pragma unroll for (i = 0; i < 32; i++) { hashOut[i] = headerHash[i]; }