From bb3c6ce75ec79217d0df63c0a269864b2e951edf Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 10:21:32 -0400
Subject: [PATCH 01/10] switch to using header api calls instead of block api
 calls

---
 gpu-miner.c | 16 ++++------------
 network.c   | 20 ++++++++------------
 network.h   |  4 ++--
 3 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/gpu-miner.c b/gpu-miner.c
index 3842853..eee6b14 100644
--- a/gpu-miner.c
+++ b/gpu-miner.c
@@ -63,12 +63,8 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) {
 	memset(headerHash, 255, 32);
 	memset(target, 255, 32);
 
-	// Store block from siad
-	uint8_t *block;
-	size_t blocklen = 0;
-
 	// Get new block header and target
-	if (get_block_for_work(curl, target, blockHeader, &block, &blocklen) != 0) {
+	if (get_header_for_work(curl, target, blockHeader) != 0) {
 		return 0;
 	}
 
@@ -84,7 +80,6 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) {
 		printf("e.g. \"./gpu-miner -s 3 -c 200\"\n");
 		printf("Waiting for problem to be resolved...");
 		fflush(stdout);
-		return -1;
 	}
 	target_corrupt_flag = 0;
 
@@ -118,17 +113,14 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) {
 
 		// Did we find one?
 		if (memcmp(headerHash, target, 8) < 0) {
-			// Copy nonce to block
-			memcpy(block+32, nonceOut, 8);
-			submit_block(curl, block, blocklen);
+			// Copy nonce to header.
+			memcpy(blockHeader+32, nonceOut, 8);
+			submit_header(curl, blockHeader);
 			blocks_mined++;
 			return -1;
 		}
 	}
 
-	// Free memory allocated in network.c
-	free(block);
-
 	// Hashrate is inaccurate if a block was found
 	#ifdef __linux__
 	clock_gettime(CLOCK_REALTIME, &end);
diff --git a/network.c b/network.c
index 5346a54..919edb4 100644
--- a/network.c
+++ b/network.c
@@ -14,8 +14,8 @@ char *bfw_url, *submit_url;
 void set_port(char *port) {
 	bfw_url = malloc(29 + strlen(port));
 	submit_url = malloc(28 + strlen(port));
-	sprintf(bfw_url, "localhost:%s/miner/blockforwork", port);
-	sprintf(submit_url, "localhost:%s/miner/submitblock", port);
+	sprintf(bfw_url, "localhost:%s/miner/headerforwork", port);
+	sprintf(submit_url, "localhost:%s/miner/submitheader", port);
 }
 
 // Write network data to an array of bytes
@@ -34,7 +34,7 @@ size_t writefunc(void *ptr, size_t size, size_t nmemb, struct inData *in) {
 	return size*nmemb;
 }
 
-int get_block_for_work(CURL *curl, uint8_t *target, uint8_t *header, uint8_t **block, size_t *blocklen) {
+int get_header_for_work(CURL *curl, uint8_t *target, uint8_t *header) {
 	if (!curl) {
 		fprintf(stderr, "Invalid curl object passed to get_block_for_work()\n");
 		exit(1);
@@ -55,31 +55,27 @@ int get_block_for_work(CURL *curl, uint8_t *target, uint8_t *header, uint8_t **b
 		fprintf(stderr, "Are you sure that siad is running?\n");
 		exit(1);
 	}
-	if (in.len < 174) {
-		fprintf(stderr, "curl did not receive enough bytes (got %zu, expected at least 174)\n", in.len);
+	if (in.len != 112) {
+		fprintf(stderr, "curl did not receive correct bytes (got %zu, expected 112)\n", in.len);
 		return 1;
 	}
 
 	// Copy data to return
-	*blocklen = in.len - 112;
-	*block = (uint8_t*)malloc(*blocklen);
 	memcpy(target, in.bytes,     32);
 	memcpy(header, in.bytes+32,  80);
-	memcpy(*block, in.bytes+112, in.len-112);
 
 	return 0;
 }
 
-void submit_block(CURL *curl, uint8_t *block, size_t blocklen) {
+void submit_header(CURL *curl, uint8_t *header) {
 	if (curl) {
 		CURLcode res;
-		curl_off_t numBytes = blocklen;
 
 		curl_easy_reset(curl);
 		curl_easy_setopt(curl, CURLOPT_URL, submit_url);
 		curl_easy_setopt(curl, CURLOPT_POST, 1);
-		curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, numBytes);
-		curl_easy_setopt(curl, CURLOPT_POSTFIELDS, block);
+		curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, 80);
+		curl_easy_setopt(curl, CURLOPT_POSTFIELDS, header);
 		// Prevent printing to stdout
 		curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writefunc);
 		curl_easy_setopt(curl, CURLOPT_WRITEDATA, NULL);
diff --git a/network.h b/network.h
index b977ca5..73bc8bf 100644
--- a/network.h
+++ b/network.h
@@ -2,5 +2,5 @@
 #include <curl/curl.h>
 
 void set_port(char *port);
-int get_block_for_work(CURL *curl, uint8_t *target, uint8_t *header, uint8_t **block, size_t *blocklen);
-void submit_block(CURL *curl, uint8_t *block, size_t blocklen);
\ No newline at end of file
+int get_header_for_work(CURL *curl, uint8_t *target, uint8_t *header);
+void submit_header(CURL *curl, uint8_t *header);

From 1ea426c89a5e084981edb25a01a54e7da0720a64 Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 11:18:07 -0400
Subject: [PATCH 02/10] minor improvements

---
 gpu-miner.c  |  2 +-
 gpu-miner.cl | 36 ++++++++++++++----------------------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/gpu-miner.c b/gpu-miner.c
index eee6b14..fd7db73 100644
--- a/gpu-miner.c
+++ b/gpu-miner.c
@@ -69,7 +69,7 @@ double grindNonces(size_t items_per_iter, int cycles_per_iter) {
 	}
 
 	// Check for target corruption
-	if (target[0] != 0 || target[1] != 0 || target[2] != 0 || target[3] != 0) {
+	if (target[0] != 0 || target[1] != 0) {
 		if (target_corrupt_flag) {
 			return -1;
 		}
diff --git a/gpu-miner.cl b/gpu-miner.cl
index 40d779f..7baf01f 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -7,11 +7,13 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	private uchar target[32];
 	headerHash[0] = 255;
 
-	int i, z;
+	int i;
+#pragma unroll
 	for (i = 0; i < 32; i++) {
 		target[i] = targ[i];
 		header[i] = headerIn[i];
 	}
+#pragma unroll
 	for (i = 32; i < 80; i++) {
 		header[i] = headerIn[i];
 	}
@@ -28,7 +30,7 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	blake2b(headerHash, header);
 
 	// Compare header to target
-	z = 0;
+	int z = 0;
 	while (target[z] == headerHash[z]) {
 		z++;
 	}
@@ -57,6 +59,7 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t
 	int i = 0 ;
 	char *dest8 = (char*)dest;
 	char *src8 = (char*)src;
+#pragma unroll
 	for (int i = 0; i < num; i++) {
 		dest8[i] = src8[i];
 	}
@@ -68,26 +71,15 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif
 
-  enum blake2b_constant
-  {
-	BLAKE2B_BLOCKBYTES = 128,
-	BLAKE2B_OUTBYTES   = 64,
-	BLAKE2B_KEYBYTES   = 64,
-	BLAKE2B_SALTBYTES  = 16,
-	BLAKE2B_PERSONALBYTES = 16
-  };
-
-#pragma pack(push, 1)
-  ALIGN( 64 ) typedef struct __blake2b_state
+  typedef struct __blake2b_state
   {
 	ulong h[8];
 	ulong t[2];
 	ulong f[2];
-	uchar  buf[2 * BLAKE2B_BLOCKBYTES];
+	uchar  buf[256];
 	size_t   buflen;
 	uchar  last_node;
   } blake2b_state;
-#pragma pack(pop)
 
   // Streaming API
   int blake2b_update( __private blake2b_state *S, __private const uchar *in, __private ulong inlen );
@@ -133,7 +125,7 @@ __constant uchar blake2b_sigma[12][16] =
 	{ 14, 10,	4,	8,	9, 15, 13,	6,	1, 12,	0,	2, 11,	7,	5,	3 }
 };
 
-static int blake2b_compress( __private blake2b_state *S, __private const uchar block[BLAKE2B_BLOCKBYTES] )
+static int blake2b_compress( __private blake2b_state *S, __private const uchar block[128] )
 {
 	ulong m[16];
 	ulong v[16];
@@ -207,17 +199,17 @@ int blake2b( __private uchar *out, __private uchar *in )
 
 	ulong inlen = 80;
 	size_t left = S->buflen;
-	size_t fill = 2 * BLAKE2B_BLOCKBYTES - left;
+	size_t fill = 2 * 128 - left;
 
 	if( inlen > fill )
 	{
 		clmemcpy( S->buf + left, in, fill ); // Fill buffer
 		S->buflen += fill;
 		blake2b_compress( S, S->buf ); // Compress
-		clmemcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left
-		S->buflen -= BLAKE2B_BLOCKBYTES;
+		clmemcpy( S->buf, S->buf + 128, 128 ); // Shift buffer left
+		S->buflen -= 128;
 	}
-	else // inlen <= fill
+	else
 	{
 		clmemcpy( S->buf + left, in, inlen );
 		S->buflen += inlen; // Be lazy, do not compress
@@ -226,10 +218,10 @@ int blake2b( __private uchar *out, __private uchar *in )
 
 	S->t[0] += S->buflen;
 	S->f[0] = ~((ulong)0);
-	clmemset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); // Padding
+	clmemset( S->buf + S->buflen, 0, 2 * 128 - S->buflen ); // Padding
 	blake2b_compress( S, S->buf );
 
-	uchar buffer[BLAKE2B_OUTBYTES];
+	uchar buffer[64];
 	for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer
 		store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
 

From d50cd1e4be60e04e79791adb28c544d3561585c6 Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 11:47:00 -0400
Subject: [PATCH 03/10] stubborn inlining of everything

---
 gpu-miner.cl | 59 +++++++++++++++-------------------------------------
 1 file changed, 17 insertions(+), 42 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index 7baf01f..caeb69b 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -2,7 +2,7 @@ int blake2b( uchar *out, uchar *in );
 
 // The kernel that grinds nonces until it finds a hash below the target
 __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) {
-	private uchar header[80];
+	private uchar header[256];
 	private uchar headerHash[32];
 	private uchar target[32];
 	headerHash[0] = 255;
@@ -77,14 +77,9 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t
 	ulong t[2];
 	ulong f[2];
 	uchar  buf[256];
-	size_t   buflen;
 	uchar  last_node;
   } blake2b_state;
 
-  // Streaming API
-  int blake2b_update( __private blake2b_state *S, __private const uchar *in, __private ulong inlen );
-  int blake2b_final( __private blake2b_state *S, __private uchar *out );
-
 static inline ulong load64( __private const void *src )
 {
   return *( ulong * )( src );
@@ -127,12 +122,27 @@ __constant uchar blake2b_sigma[12][16] =
 
 static int blake2b_compress( __private blake2b_state *S, __private const uchar block[128] )
 {
+	return 0;
+}
+
+int blake2b( __private uchar *out, __private uchar *in )
+{
+	// Initialize a state.
+	private blake2b_state S[1];
+	clmemset( S, 0, sizeof( blake2b_state ) );
+	for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
+	S->h[0] ^= 0x0000000001010020UL;
+
+	S->t[0] += 80;
+	S->f[0] = ~((ulong)0);
+	clmemset( in + 80, 0, 2 * 128 - 80 ); // Padding
+
 	ulong m[16];
 	ulong v[16];
 	int i;
 
 	for( i = 0; i < 16; ++i )
-		m[i] = load64( block + i * sizeof( m[i] ) );
+		m[i] = load64( in + i * sizeof( m[i] ) );
 
 	for( i = 0; i < 8; ++i )
 		v[i] = S->h[i];
@@ -185,41 +195,6 @@ static int blake2b_compress( __private blake2b_state *S, __private const uchar b
 
 #undef G
 #undef ROUND
-	return 0;
-}
-
-// inlen, at least, should be ulong. Others can be size_t.
-int blake2b( __private uchar *out, __private uchar *in )
-{
-	private blake2b_state S[1];
-
-	clmemset( S, 0, sizeof( blake2b_state ) );
-	for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
-	S->h[0] ^= 0x0000000001010020UL;
-
-	ulong inlen = 80;
-	size_t left = S->buflen;
-	size_t fill = 2 * 128 - left;
-
-	if( inlen > fill )
-	{
-		clmemcpy( S->buf + left, in, fill ); // Fill buffer
-		S->buflen += fill;
-		blake2b_compress( S, S->buf ); // Compress
-		clmemcpy( S->buf, S->buf + 128, 128 ); // Shift buffer left
-		S->buflen -= 128;
-	}
-	else
-	{
-		clmemcpy( S->buf + left, in, inlen );
-		S->buflen += inlen; // Be lazy, do not compress
-	}
-
-
-	S->t[0] += S->buflen;
-	S->f[0] = ~((ulong)0);
-	clmemset( S->buf + S->buflen, 0, 2 * 128 - S->buflen ); // Padding
-	blake2b_compress( S, S->buf );
 
 	uchar buffer[64];
 	for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer

From b1a9b2989ecddcd9869c9d6cb2e0297db150b108 Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 11:49:44 -0400
Subject: [PATCH 04/10] clear extra memset

---
 gpu-miner.cl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index caeb69b..d1f468f 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -2,7 +2,7 @@ int blake2b( uchar *out, uchar *in );
 
 // The kernel that grinds nonces until it finds a hash below the target
 __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) {
-	private uchar header[256];
+	private uchar header[256] = {0};
 	private uchar headerHash[32];
 	private uchar target[32];
 	headerHash[0] = 255;
@@ -135,7 +135,6 @@ int blake2b( __private uchar *out, __private uchar *in )
 
 	S->t[0] += 80;
 	S->f[0] = ~((ulong)0);
-	clmemset( in + 80, 0, 2 * 128 - 80 ); // Padding
 
 	ulong m[16];
 	ulong v[16];

From 8dee5f831a98b12fbd92386baf49f0e727e22bab Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 11:59:21 -0400
Subject: [PATCH 05/10] inline blake2b

---
 gpu-miner.cl | 99 ++++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 50 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index d1f468f..d0eadf2 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -1,51 +1,5 @@
 int blake2b( uchar *out, uchar *in );
 
-// The kernel that grinds nonces until it finds a hash below the target
-__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) {
-	private uchar header[256] = {0};
-	private uchar headerHash[32];
-	private uchar target[32];
-	headerHash[0] = 255;
-
-	int i;
-#pragma unroll
-	for (i = 0; i < 32; i++) {
-		target[i] = targ[i];
-		header[i] = headerIn[i];
-	}
-#pragma unroll
-	for (i = 32; i < 80; i++) {
-		header[i] = headerIn[i];
-	}
-
-	// Set nonce
-	private int id = get_global_id(0);
-	// Support global work sizes of up to 256^4 - 1
-	header[32] = id / (256 * 256 * 256);
-	header[33] = id / (256 * 256);
-	header[34] = id / 256;
-	header[35] = id % 256;
-
-	// Hash the header
-	blake2b(headerHash, header);
-
-	// Compare header to target
-	int z = 0;
-	while (target[z] == headerHash[z]) {
-		z++;
-	}
-	if (headerHash[z] < target[z]) {
-		// Transfer the output to global space.
-		for (i = 0; i < 8; i++) {
-			nonceOut[i] = header[i + 32];
-		}
-		for (i = 0; i < 32; i++) {
-			hashOut[i] = headerHash[i];
-		}
-		return;
-	}
-}
-
 // Implementations of clmemset and memcopy
 void *clmemset( __private void *s, __private int c, __private size_t n) {
 	uchar *p = s;
@@ -127,6 +81,38 @@ static int blake2b_compress( __private blake2b_state *S, __private const uchar b
 
 int blake2b( __private uchar *out, __private uchar *in )
 {
+
+	return 0;
+}
+
+// The kernel that grinds nonces until it finds a hash below the target
+__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) {
+	private uchar header[256] = {0};
+	private uchar headerHash[32];
+	private uchar target[32];
+	headerHash[0] = 255;
+
+	int i;
+#pragma unroll
+	for (i = 0; i < 32; i++) {
+		target[i] = targ[i];
+		header[i] = headerIn[i];
+	}
+#pragma unroll
+	for (i = 32; i < 80; i++) {
+		header[i] = headerIn[i];
+	}
+
+	// Set nonce
+	private int id = get_global_id(0);
+	// Support global work sizes of up to 256^4 - 1
+	header[32] = id / (256 * 256 * 256);
+	header[33] = id / (256 * 256);
+	header[34] = id / 256;
+	header[35] = id % 256;
+
+	// Hash the header
+	// blake2b(headerHash, header);
 	// Initialize a state.
 	private blake2b_state S[1];
 	clmemset( S, 0, sizeof( blake2b_state ) );
@@ -138,10 +124,9 @@ int blake2b( __private uchar *out, __private uchar *in )
 
 	ulong m[16];
 	ulong v[16];
-	int i;
 
 	for( i = 0; i < 16; ++i )
-		m[i] = load64( in + i * sizeof( m[i] ) );
+		m[i] = load64( header + i * sizeof( m[i] ) );
 
 	for( i = 0; i < 8; ++i )
 		v[i] = S->h[i];
@@ -199,7 +184,21 @@ int blake2b( __private uchar *out, __private uchar *in )
 	for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer
 		store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
 
-	clmemcpy( out, buffer, 32 );
+	clmemcpy( headerHash, buffer, 32 );
 
-	return 0;
+	// Compare header to target
+	int z = 0;
+	while (target[z] == headerHash[z]) {
+		z++;
+	}
+	if (headerHash[z] < target[z]) {
+		// Transfer the output to global space.
+		for (i = 0; i < 8; i++) {
+			nonceOut[i] = header[i + 32];
+		}
+		for (i = 0; i < 32; i++) {
+			hashOut[i] = headerHash[i];
+		}
+		return;
+	}
 }

From 369fbfe9357991004090269c6d4f6b2110c4f36e Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 12:04:23 -0400
Subject: [PATCH 06/10] memcpy to headerHash

---
 gpu-miner.cl | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index d0eadf2..83994fc 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -1,5 +1,3 @@
-int blake2b( uchar *out, uchar *in );
-
 // Implementations of clmemset and memcopy
 void *clmemset( __private void *s, __private int c, __private size_t n) {
 	uchar *p = s;
@@ -74,17 +72,6 @@ __constant uchar blake2b_sigma[12][16] =
 	{ 14, 10,	4,	8,	9, 15, 13,	6,	1, 12,	0,	2, 11,	7,	5,	3 }
 };
 
-static int blake2b_compress( __private blake2b_state *S, __private const uchar block[128] )
-{
-	return 0;
-}
-
-int blake2b( __private uchar *out, __private uchar *in )
-{
-
-	return 0;
-}
-
 // The kernel that grinds nonces until it finds a hash below the target
 __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) {
 	private uchar header[256] = {0};

From 4324c4527d7af364a114144ef75ed64eed61d47b Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 12:15:14 -0400
Subject: [PATCH 07/10] toss blake2b state struct

---
 gpu-miner.cl | 60 ++++++++++++++++++----------------------------------
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index 83994fc..29b8f8c 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -1,12 +1,3 @@
-// Implementations of clmemset and memcopy
-void *clmemset( __private void *s, __private int c, __private size_t n) {
-	uchar *p = s;
-	while(n--) {
-		*p++ = (uchar)c;
-	}
-	return s;
-}
-
 void clmemcpy( __private void *dest, __private const void *src, __private size_t num) {
 	int i = 0 ;
 	char *dest8 = (char*)dest;
@@ -23,14 +14,6 @@ void clmemcpy( __private void *dest, __private const void *src, __private size_t
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif
 
-  typedef struct __blake2b_state
-  {
-	ulong h[8];
-	ulong t[2];
-	ulong f[2];
-	uchar  buf[256];
-	uchar  last_node;
-  } blake2b_state;
 
 static inline ulong load64( __private const void *src )
 {
@@ -73,16 +56,16 @@ __constant uchar blake2b_sigma[12][16] =
 };
 
 // The kernel that grinds nonces until it finds a hash below the target
-__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targ, __global uchar *nonceOut) {
+__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targetIn, __global uchar *nonceOut) {
 	private uchar header[256] = {0};
-	private uchar headerHash[32];
+	private uchar headerHash[64];
 	private uchar target[32];
 	headerHash[0] = 255;
 
 	int i;
 #pragma unroll
 	for (i = 0; i < 32; i++) {
-		target[i] = targ[i];
+		target[i] = targetIn[i];
 		header[i] = headerIn[i];
 	}
 #pragma unroll
@@ -98,16 +81,13 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	header[34] = id / 256;
 	header[35] = id % 256;
 
-	// Hash the header
-	// blake2b(headerHash, header);
-	// Initialize a state.
-	private blake2b_state S[1];
-	clmemset( S, 0, sizeof( blake2b_state ) );
-	for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
-	S->h[0] ^= 0x0000000001010020UL;
-
-	S->t[0] += 80;
-	S->f[0] = ~((ulong)0);
+	// BLAKE2B START
+	ulong h[8] = {0};
+	ulong t[2] = {80, 0};
+	ulong f[2] = {0};
+	for( int i = 0; i < 8; ++i ) h[i] = blake2b_IV[i];
+	h[0] ^= 0x0000000001010020UL;
+	f[0] = ~((ulong)0);
 
 	ulong m[16];
 	ulong v[16];
@@ -116,16 +96,16 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 		m[i] = load64( header + i * sizeof( m[i] ) );
 
 	for( i = 0; i < 8; ++i )
-		v[i] = S->h[i];
+		v[i] = h[i];
 
 	v[ 8] = blake2b_IV[0];
 	v[ 9] = blake2b_IV[1];
 	v[10] = blake2b_IV[2];
 	v[11] = blake2b_IV[3];
-	v[12] = S->t[0] ^ blake2b_IV[4];
-	v[13] = S->t[1] ^ blake2b_IV[5];
-	v[14] = S->f[0] ^ blake2b_IV[6];
-	v[15] = S->f[1] ^ blake2b_IV[7];
+	v[12] = t[0] ^ blake2b_IV[4];
+	v[13] = t[1] ^ blake2b_IV[5];
+	v[14] = f[0] ^ blake2b_IV[6];
+	v[15] = f[1] ^ blake2b_IV[7];
 #define G(r,i,a,b,c,d) \
 	do { \
 		a = a + b + m[blake2b_sigma[r][2*i+0]]; \
@@ -162,16 +142,14 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	ROUND( 11 );
 
 	for( i = 0; i < 8; ++i )
-		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+		h[i] = h[i] ^ v[i] ^ v[i + 8];
 
 #undef G
 #undef ROUND
 
-	uchar buffer[64];
 	for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer
-		store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
-
-	clmemcpy( headerHash, buffer, 32 );
+		store64( headerHash + sizeof( h[i] ) * i, h[i] );
+	// BLAKE2B END
 
 	// Compare header to target
 	int z = 0;
@@ -180,9 +158,11 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	}
 	if (headerHash[z] < target[z]) {
 		// Transfer the output to global space.
+#pragma unroll
 		for (i = 0; i < 8; i++) {
 			nonceOut[i] = header[i + 32];
 		}
+#pragma unroll
 		for (i = 0; i < 32; i++) {
 			hashOut[i] = headerHash[i];
 		}

From eec327dce41acb900eb53889735fcd792dfeb89b Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 12:41:15 -0400
Subject: [PATCH 08/10] smoosh things

---
 gpu-miner.cl | 103 ++++++++++++++++-----------------------------------
 1 file changed, 32 insertions(+), 71 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index 29b8f8c..7013fbd 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -1,20 +1,3 @@
-void clmemcpy( __private void *dest, __private const void *src, __private size_t num) {
-	int i = 0 ;
-	char *dest8 = (char*)dest;
-	char *src8 = (char*)src;
-#pragma unroll
-	for (int i = 0; i < num; i++) {
-		dest8[i] = src8[i];
-	}
-}
-
-#if defined(_MSC_VER)
-#define ALIGN(x) __declspec(align(x))
-#else
-#define ALIGN(x) __attribute__((aligned(x)))
-#endif
-
-
 static inline ulong load64( __private const void *src )
 {
   return *( ulong * )( src );
@@ -29,16 +12,6 @@ static inline ulong rotr64( __private const ulong w, __private const unsigned c
 {
   return ( w >> c ) | ( w << ( 64 - c ) );
 }
-
-// blake2b-ref.c
-__constant ulong blake2b_IV[8] =
-{
-	0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
-	0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
-	0x510e527fade682d1, 0x9b05688c2b3e6c1f,
-	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-};
-
 __constant uchar blake2b_sigma[12][16] =
 {
 	{	0,	1,	2,	3,	4,	5,	6,	7,	8,	9, 10, 11, 12, 13, 14, 15 } ,
@@ -60,8 +33,8 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	private uchar header[256] = {0};
 	private uchar headerHash[64];
 	private uchar target[32];
-	headerHash[0] = 255;
 
+	// Transfer inputs from global memory
 	int i;
 #pragma unroll
 	for (i = 0; i < 32; i++) {
@@ -82,52 +55,43 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	header[35] = id % 256;
 
 	// BLAKE2B START
-	ulong h[8] = {0};
+	ulong iv[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 };
+	ulong v[16];
 	ulong t[2] = {80, 0};
-	ulong f[2] = {0};
-	for( int i = 0; i < 8; ++i ) h[i] = blake2b_IV[i];
-	h[0] ^= 0x0000000001010020UL;
-	f[0] = ~((ulong)0);
-
+	ulong f[2] = {~0, 0};
 	ulong m[16];
-	ulong v[16];
-
 	for( i = 0; i < 16; ++i )
 		m[i] = load64( header + i * sizeof( m[i] ) );
-
+	v[ 8] = iv[0];
+	v[ 9] = iv[1];
+	v[10] = iv[2];
+	v[11] = iv[3];
+	v[12] = t[0] ^ iv[4];
+	v[13] = t[1] ^ iv[5];
+	v[14] = f[0] ^ iv[6];
+	v[15] = f[1] ^ iv[7];
+	iv[0] ^= 0x0000000001010020UL;
 	for( i = 0; i < 8; ++i )
-		v[i] = h[i];
+		v[i] = iv[i];
 
-	v[ 8] = blake2b_IV[0];
-	v[ 9] = blake2b_IV[1];
-	v[10] = blake2b_IV[2];
-	v[11] = blake2b_IV[3];
-	v[12] = t[0] ^ blake2b_IV[4];
-	v[13] = t[1] ^ blake2b_IV[5];
-	v[14] = f[0] ^ blake2b_IV[6];
-	v[15] = f[1] ^ blake2b_IV[7];
 #define G(r,i,a,b,c,d) \
-	do { \
-		a = a + b + m[blake2b_sigma[r][2*i+0]]; \
-		d = rotr64(d ^ a, 32); \
-		c = c + d; \
-		b = rotr64(b ^ c, 24); \
-		a = a + b + m[blake2b_sigma[r][2*i+1]]; \
-		d = rotr64(d ^ a, 16); \
-		c = c + d; \
-		b = rotr64(b ^ c, 63); \
-	} while(0)
+	a = a + b + m[blake2b_sigma[r][2*i+0]]; \
+	d = rotr64(d ^ a, 32); \
+	c = c + d; \
+	b = rotr64(b ^ c, 24); \
+	a = a + b + m[blake2b_sigma[r][2*i+1]]; \
+	d = rotr64(d ^ a, 16); \
+	c = c + d; \
+	b = rotr64(b ^ c, 63);
 #define ROUND(r)	\
-	do { \
-		G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-		G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-		G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-		G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-		G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-		G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
-	} while(0)
+	G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+	G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+	G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+	G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+	G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+	G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+	G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+	G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
 	ROUND( 0 );
 	ROUND( 1 );
 	ROUND( 2 );
@@ -140,15 +104,12 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	ROUND( 9 );
 	ROUND( 10 );
 	ROUND( 11 );
-
 	for( i = 0; i < 8; ++i )
-		h[i] = h[i] ^ v[i] ^ v[i + 8];
-
+		iv[i] = iv[i] ^ v[i] ^ v[i + 8];
 #undef G
 #undef ROUND
-
 	for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer
-		store64( headerHash + sizeof( h[i] ) * i, h[i] );
+		store64( headerHash + sizeof( iv[i] ) * i, iv[i] );
 	// BLAKE2B END
 
 	// Compare header to target

From fb82351d541729792a734c4d476a8dddb0ee9a3e Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 13:15:43 -0400
Subject: [PATCH 09/10] merge memory structures

---
 gpu-miner.cl | 90 +++++++++++++++++++++++-----------------------------
 1 file changed, 40 insertions(+), 50 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index 7013fbd..d9e0b9d 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -1,37 +1,11 @@
-static inline ulong load64( __private const void *src )
-{
-  return *( ulong * )( src );
-}
-
-static inline void store64( __private void *dst, __private ulong w )
-{
-  *( ulong * )( dst ) = w;
-}
-
 static inline ulong rotr64( __private const ulong w, __private const unsigned c )
 {
   return ( w >> c ) | ( w << ( 64 - c ) );
 }
-__constant uchar blake2b_sigma[12][16] =
-{
-	{	0,	1,	2,	3,	4,	5,	6,	7,	8,	9, 10, 11, 12, 13, 14, 15 } ,
-	{ 14, 10,	4,	8,	9, 15, 13,	6,	1, 12,	0,	2, 11,	7,	5,	3 } ,
-	{ 11,	8, 12,	0,	5,	2, 15, 13, 10, 14,	3,	6,	7,	1,	9,	4 } ,
-	{	7,	9,	3,	1, 13, 12, 11, 14,	2,	6,	5, 10,	4,	0, 15,	8 } ,
-	{	9,	0,	5,	7,	2,	4, 10, 15, 14,	1, 11, 12,	6,	8,	3, 13 } ,
-	{	2, 12,	6, 10,	0, 11,	8,	3,	4, 13,	7,	5, 15, 14,	1,	9 } ,
-	{ 12,	5,	1, 15, 14, 13,	4, 10,	0,	7,	6,	3,	9,	2,	8, 11 } ,
-	{ 13, 11,	7, 14, 12,	1,	3,	9,	5,	0, 15,	4,	8,	6,	2, 10 } ,
-	{	6, 15, 14,	9, 11,	3,	0,	8, 12,	2, 13,	7,	1,	4, 10,	5 } ,
-	{ 10,	2,	8,	4,	7,	6,	1,	5, 15, 11,	9, 14,	3, 12, 13 , 0 } ,
-	{	0,	1,	2,	3,	4,	5,	6,	7,	8,	9, 10, 11, 12, 13, 14, 15 } ,
-	{ 14, 10,	4,	8,	9, 15, 13,	6,	1, 12,	0,	2, 11,	7,	5,	3 }
-};
 
 // The kernel that grinds nonces until it finds a hash below the target
 __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targetIn, __global uchar *nonceOut) {
 	private uchar header[256] = {0};
-	private uchar headerHash[64];
 	private uchar target[32];
 
 	// Transfer inputs from global memory
@@ -48,32 +22,26 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 
 	// Set nonce
 	private int id = get_global_id(0);
-	// Support global work sizes of up to 256^4 - 1
 	header[32] = id / (256 * 256 * 256);
 	header[33] = id / (256 * 256);
 	header[34] = id / 256;
 	header[35] = id % 256;
 
-	// BLAKE2B START
-	ulong iv[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 };
-	ulong v[16];
-	ulong t[2] = {80, 0};
-	ulong f[2] = {~0, 0};
-	ulong m[16];
-	for( i = 0; i < 16; ++i )
-		m[i] = load64( header + i * sizeof( m[i] ) );
-	v[ 8] = iv[0];
-	v[ 9] = iv[1];
-	v[10] = iv[2];
-	v[11] = iv[3];
-	v[12] = t[0] ^ iv[4];
-	v[13] = t[1] ^ iv[5];
-	v[14] = f[0] ^ iv[6];
-	v[15] = f[1] ^ iv[7];
-	iv[0] ^= 0x0000000001010020UL;
-	for( i = 0; i < 8; ++i )
-		v[i] = iv[i];
-
+	uchar blake2b_sigma[12][16] =
+	{
+		{	0,	1,	2,	3,	4,	5,	6,	7,	8,	9, 10, 11, 12, 13, 14, 15 } ,
+		{ 14, 10,	4,	8,	9, 15, 13,	6,	1, 12,	0,	2, 11,	7,	5,	3 } ,
+		{ 11,	8, 12,	0,	5,	2, 15, 13, 10, 14,	3,	6,	7,	1,	9,	4 } ,
+		{	7,	9,	3,	1, 13, 12, 11, 14,	2,	6,	5, 10,	4,	0, 15,	8 } ,
+		{	9,	0,	5,	7,	2,	4, 10, 15, 14,	1, 11, 12,	6,	8,	3, 13 } ,
+		{	2, 12,	6, 10,	0, 11,	8,	3,	4, 13,	7,	5, 15, 14,	1,	9 } ,
+		{ 12,	5,	1, 15, 14, 13,	4, 10,	0,	7,	6,	3,	9,	2,	8, 11 } ,
+		{ 13, 11,	7, 14, 12,	1,	3,	9,	5,	0, 15,	4,	8,	6,	2, 10 } ,
+		{	6, 15, 14,	9, 11,	3,	0,	8, 12,	2, 13,	7,	1,	4, 10,	5 } ,
+		{ 10,	2,	8,	4,	7,	6,	1,	5, 15, 11,	9, 14,	3, 12, 13 , 0 } ,
+		{	0,	1,	2,	3,	4,	5,	6,	7,	8,	9, 10, 11, 12, 13, 14, 15 } ,
+		{ 14, 10,	4,	8,	9, 15, 13,	6,	1, 12,	0,	2, 11,	7,	5,	3 }
+	};
 #define G(r,i,a,b,c,d) \
 	a = a + b + m[blake2b_sigma[r][2*i+0]]; \
 	d = rotr64(d ^ a, 32); \
@@ -92,6 +60,26 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	G(r,5,v[ 1],v[ 6],v[11],v[12]); \
 	G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
 	G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+	// BLAKE2B START
+
+	ulong iv[8] = { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 };
+	ulong v[16];
+	ulong t[2] = {80, 0};
+	ulong f[2] = {~0, 0};
+	ulong m[16];
+	for( i = 0; i < 16; ++i )
+		m[i] = *(ulong*)( header + i * 8 );
+	v[ 8] = iv[0];
+	v[ 9] = iv[1];
+	v[10] = iv[2];
+	v[11] = iv[3];
+	v[12] = t[0] ^ iv[4];
+	v[13] = t[1] ^ iv[5];
+	v[14] = f[0] ^ iv[6];
+	v[15] = f[1] ^ iv[7];
+	iv[0] ^= 0x0000000001010020UL;
+	for( i = 0; i < 8; ++i )
+		v[i] = iv[i];
 	ROUND( 0 );
 	ROUND( 1 );
 	ROUND( 2 );
@@ -104,13 +92,15 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	ROUND( 9 );
 	ROUND( 10 );
 	ROUND( 11 );
+	private uchar headerHash[64];
 	for( i = 0; i < 8; ++i )
 		iv[i] = iv[i] ^ v[i] ^ v[i + 8];
-#undef G
-#undef ROUND
 	for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer
-		store64( headerHash + sizeof( iv[i] ) * i, iv[i] );
+		*(ulong*)(headerHash + 8 * i) = iv[i];
+
 	// BLAKE2B END
+#undef G
+#undef ROUND
 
 	// Compare header to target
 	int z = 0;

From e7c9524b5075a68aa2b00eeaedf5102717b5a3a9 Mon Sep 17 00:00:00 2001
From: David Vorick <david.vorick@gmail.com>
Date: Tue, 9 Jun 2015 14:28:18 -0400
Subject: [PATCH 10/10] 32bit magic

---
 gpu-miner.cl | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/gpu-miner.cl b/gpu-miner.cl
index d9e0b9d..92f9ec7 100644
--- a/gpu-miner.cl
+++ b/gpu-miner.cl
@@ -1,31 +1,24 @@
-static inline ulong rotr64( __private const ulong w, __private const unsigned c )
+static inline ulong rotr64( __const ulong w, __const unsigned c )
 {
   return ( w >> c ) | ( w << ( 64 - c ) );
 }
 
 // The kernel that grinds nonces until it finds a hash below the target
-__kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __global uchar *targetIn, __global uchar *nonceOut) {
-	private uchar header[256] = {0};
-	private uchar target[32];
+__kernel void nonceGrind(__global uint *headerIn, __global uchar *hashOut, __global uint *targetIn, __global uchar *nonceOut) {
+	uchar header[256] = {0};
+	uchar target[32];
 
 	// Transfer inputs from global memory
 	int i;
-#pragma unroll
-	for (i = 0; i < 32; i++) {
-		target[i] = targetIn[i];
-		header[i] = headerIn[i];
+	for (i = 0; i < 8; i++) {
+		*(uint*)(target + i * 4) = targetIn[i];
 	}
-#pragma unroll
-	for (i = 32; i < 80; i++) {
-		header[i] = headerIn[i];
+	for (i = 0; i < 20; i++) {
+		*(uint*)(header + i * 4) = headerIn[i];
 	}
 
 	// Set nonce
-	private int id = get_global_id(0);
-	header[32] = id / (256 * 256 * 256);
-	header[33] = id / (256 * 256);
-	header[34] = id / 256;
-	header[35] = id % 256;
+	*(uint*)(header + 32) = get_global_id(0);
 
 	uchar blake2b_sigma[12][16] =
 	{
@@ -92,7 +85,7 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 	ROUND( 9 );
 	ROUND( 10 );
 	ROUND( 11 );
-	private uchar headerHash[64];
+	uchar headerHash[64];
 	for( i = 0; i < 8; ++i )
 		iv[i] = iv[i] ^ v[i] ^ v[i + 8];
 	for( int i = 0; i < 8; ++i ) // Output full hash to temp buffer
@@ -102,18 +95,16 @@ __kernel void nonceGrind(__global uchar *headerIn, __global uchar *hashOut, __gl
 #undef G
 #undef ROUND
 
-	// Compare header to target
-	int z = 0;
-	while (target[z] == headerHash[z]) {
-		z++;
+	// Compare hash to target
+	i = 0;
+	while (target[i] == headerHash[i]) {
+		i++;
 	}
-	if (headerHash[z] < target[z]) {
+	if (headerHash[i] < target[i]) {
 		// Transfer the output to global space.
-#pragma unroll
 		for (i = 0; i < 8; i++) {
 			nonceOut[i] = header[i + 32];
 		}
-#pragma unroll
 		for (i = 0; i < 32; i++) {
 			hashOut[i] = headerHash[i];
 		}