Commit 033913ca8e1a2220cec1dd7e11a47508f1e59ae3

ckolivas 2012-02-08T15:40:10

First working port of the diakgcn kernel.

diff --git a/device-gpu.c b/device-gpu.c
index d40db48..e39bff4 100644
--- a/device-gpu.c
+++ b/device-gpu.c
@@ -673,39 +673,42 @@ void manage_gpu(void)
 #ifdef HAVE_OPENCL
 static _clState *clStates[MAX_GPUDEVICES];
 
+#define CL_SET_BLKARG(blkvar) status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->blkvar)
+#define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&var)
+#define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, args * sizeof(uint), (void *)var)
+
 static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk)
 {
 	cl_kernel *kernel = &clState->kernel;
 	cl_int status = 0;
 	int num = 0;
 
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce);
-
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2);
-
-	status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer),
-				 (void *)&clState->outputBuffer);
+	CL_SET_BLKARG(ctx_a);
+	CL_SET_BLKARG(ctx_b);
+	CL_SET_BLKARG(ctx_c);
+	CL_SET_BLKARG(ctx_d);
+	CL_SET_BLKARG(ctx_e);
+	CL_SET_BLKARG(ctx_f);
+	CL_SET_BLKARG(ctx_g);
+	CL_SET_BLKARG(ctx_h);
+	CL_SET_BLKARG(cty_b);
+	CL_SET_BLKARG(cty_c);
+	CL_SET_BLKARG(cty_d);
+	CL_SET_BLKARG(cty_f);
+	CL_SET_BLKARG(cty_g);
+	CL_SET_BLKARG(cty_h);
+	CL_SET_BLKARG(nonce);
+
+	CL_SET_BLKARG(fW0);
+	CL_SET_BLKARG(fW1);
+	CL_SET_BLKARG(fW2);
+	CL_SET_BLKARG(fW3);
+	CL_SET_BLKARG(fW15);
+	CL_SET_BLKARG(fW01r);
+	CL_SET_BLKARG(fcty_e);
+	CL_SET_BLKARG(fcty_e2);
+
+	CL_SET_ARG(clState->outputBuffer);
 
 	return status;
 }
@@ -718,75 +721,87 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk)
 	int i, num = 0;
 	uint *nonces;
 
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h);
-
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h);
+	CL_SET_BLKARG(ctx_a);
+	CL_SET_BLKARG(ctx_b);
+	CL_SET_BLKARG(ctx_c);
+	CL_SET_BLKARG(ctx_d);
+	CL_SET_BLKARG(ctx_e);
+	CL_SET_BLKARG(ctx_f);
+	CL_SET_BLKARG(ctx_g);
+	CL_SET_BLKARG(ctx_h);
+
+	CL_SET_BLKARG(cty_b);
+	CL_SET_BLKARG(cty_c);
+	CL_SET_BLKARG(cty_d);
+	CL_SET_BLKARG(cty_f);
+	CL_SET_BLKARG(cty_g);
+	CL_SET_BLKARG(cty_h);
 
 	nonces = alloca(sizeof(uint) * vwidth);
 	for (i = 0; i < vwidth; i++)
 		nonces[i] = blk->nonce + i;
 	status |= clSetKernelArg(*kernel, num++, vwidth * sizeof(uint), (void *)nonces);
 
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W16);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W17);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal4_2);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal0);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW18);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW19);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW31);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW32);
+	CL_SET_BLKARG(W16);
+	CL_SET_BLKARG(W17);
+	CL_SET_BLKARG(PreVal4_2);
+	CL_SET_BLKARG(PreVal0);
+	CL_SET_BLKARG(PreW18);
+	CL_SET_BLKARG(PreW19);
+	CL_SET_BLKARG(PreW31);
+	CL_SET_BLKARG(PreW32);
 
-	status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer),
-				 (void *)&clState->outputBuffer);
+	CL_SET_ARG(clState->outputBuffer);
 
 	return status;
 }
 
 static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk)
 {
+	cl_uint vwidth = clState->preferred_vwidth;
 	cl_kernel *kernel = &clState->kernel;
 	cl_int status = 0;
-	int num = 0;
+	int i, num = 0;
+	uint *nonces;
 
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce);
-
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e);
-	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2);
-
-	status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer),
-				 (void *)&clState->outputBuffer);
+	nonces = alloca(sizeof(uint) * vwidth);
+	for (i = 0; i < vwidth; i++)
+		nonces[i] = blk->nonce + i;
+	CL_SET_VARG(vwidth, nonces);
+
+	CL_SET_BLKARG(PreVal4);
+	CL_SET_BLKARG(cty_h);
+	CL_SET_BLKARG(cty_d);
+	CL_SET_BLKARG(PreVal0);
+	CL_SET_BLKARG(cty_b);
+	CL_SET_BLKARG(cty_c);
+	CL_SET_BLKARG(cty_f);
+	CL_SET_BLKARG(cty_g);
+	CL_SET_BLKARG(C1addK5);
+	CL_SET_BLKARG(B1addK6);
+	CL_SET_BLKARG(PreVal0addK7);
+	CL_SET_BLKARG(W16addK16);
+	CL_SET_BLKARG(W17addK17);
+	CL_SET_BLKARG(PreW18);
+	CL_SET_BLKARG(PreW19);
+	CL_SET_BLKARG(W16);
+	CL_SET_BLKARG(W17);
+	CL_SET_BLKARG(PreW31);
+	CL_SET_BLKARG(PreW32);
+
+	CL_SET_BLKARG(ctx_a);
+	CL_SET_BLKARG(ctx_b);
+	CL_SET_BLKARG(ctx_c);
+	CL_SET_BLKARG(ctx_d);
+	CL_SET_BLKARG(ctx_e);
+	CL_SET_BLKARG(ctx_f);
+	CL_SET_BLKARG(ctx_g);
+	CL_SET_BLKARG(ctx_h);
+
+	CL_SET_BLKARG(A0);
+	CL_SET_BLKARG(B0);
+
+	CL_SET_ARG(clState->outputBuffer);
 
 	return status;
 }
diff --git a/diakgcn120208.cl b/diakgcn120208.cl
index f8b263f..84f02e1 100644
--- a/diakgcn120208.cl
+++ b/diakgcn120208.cl
@@ -3,9 +3,7 @@
 // Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3.
 // The kernel was rewritten by me (Diapolo) and is still public-domain!
 
-#ifdef VECTORS8
-	typedef uint8 u;
-#elif defined VECTORS4
+#if defined VECTORS4
 	typedef uint4 u;
 #elif defined VECTORS2
 	typedef uint2 u;
@@ -31,9 +29,7 @@
 #ifdef GOFFSET
 	typedef uint uu;
 #else
-	#ifdef VECTORS8
-		typedef uint8 uu;
-	#elif defined VECTORS4
+	#if defined VECTORS4
 		typedef uint4 uu;
 	#elif defined VECTORS2
 		typedef uint2 uu;
@@ -67,29 +63,23 @@ __kernel
 	u W[17];
 	u V[8];
 
-#ifdef VECTORS8
+#if defined VECTORS4
 	#ifdef GOFFSET
-		u nonce = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7);
+		u nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3);
 	#else
-		u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 8U) + base;
-	#endif
-#elif defined VECTORS4
-	#ifdef GOFFSET
-		u nonce = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3);
-	#else
-		u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 4U) + base;
+		u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
 	#endif
 #elif defined VECTORS2
 	#ifdef GOFFSET
-		u nonce = ((uint)get_global_id(0) << 1) + (u)(0, 1);
+		u nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1);
 	#else
-		u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) * 2U) + ((uint)get_local_id(0) * 2U) + base;
+		u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
 	#endif
 #else
 	#ifdef GOFFSET
-		u nonce = (uint)get_global_id(0);
+		u nonce = base + get_global_id(0);
 	#else
-		u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + (uint)get_local_id(0) + base;
+		u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
 	#endif
 #endif
 
@@ -589,22 +579,40 @@ __kernel
 
 	V[7] += V[3] + W[10] + ch(124) + rot26(V[0]);
 
-#ifdef VECTORS8
-	u result = (u)(((V[7].s0 == 0x136032ed) * nonce.s0), ((V[7].s1 == 0x136032ed) * nonce.s1), ((V[7].s2 == 0x136032ed) * nonce.s2), ((V[7].s3 == 0x136032ed) * nonce.s3),
-				   ((V[7].s4 == 0x136032ed) * nonce.s4), ((V[7].s5 == 0x136032ed) * nonce.s5), ((V[7].s6 == 0x136032ed) * nonce.s6), ((V[7].s7 == 0x136032ed) * nonce.s7));
-	output[0 + (upsample(result.s0, result.s1) > 0)] = upsample(result.s0, result.s1);
-	output[2 + (upsample(result.s2, result.s3) > 1)] = upsample(result.s2, result.s3);
-	output[4 + (upsample(result.s4, result.s5) > 0)] = upsample(result.s4, result.s5);
-	output[6 + (upsample(result.s6, result.s7) > 1)] = upsample(result.s6, result.s7);
-#elif defined VECTORS4
-	u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y), ((V[7].z == 0x136032ed) * nonce.z), ((V[7].w == 0x136032ed) * nonce.w));
-	output[0 + (upsample(result.x, result.y) > 0)] = upsample(result.x, result.y);
-	output[2 + (upsample(result.z, result.w) > 1)] = upsample(result.z, result.w);
-#elif defined VECTORS2
-	u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y));
-	output[upsample(result.x, result.y) > 0] = upsample(result.x, result.y);
+
+#define FOUND (0x80)
+#define NFLAG (0x7F)
+
+#ifdef VECTORS4
+	V[7] ^= 0x136032ed;
+
+	bool result = V[7].x & V[7].y & V[7].z & V[7].w;
+
+	if (!result) {
+		if (!V[7].x)
+			output[FOUND] = output[NFLAG & W[3].x] = W[3].x;
+		if (!V[7].y)
+			output[FOUND] = output[NFLAG & W[3].y] = W[3].y;
+		if (!V[7].z)
+			output[FOUND] = output[NFLAG & W[3].z] = W[3].z;
+		if (!V[7].w)
+			output[FOUND] = output[NFLAG & W[3].w] = W[3].w;
+	}
 #else
-	u result = (V[7] == 0x136032ed) * nonce;
-	output[result != 0] = result;
+	#ifdef VECTORS2
+		V[7] ^= 0x136032ed;
+
+		bool result = V[7].x & V[7].y;
+
+		if (!result) {
+			if (!V[7].x)
+				output[FOUND] = output[NFLAG & W[3].x] = W[3].x;
+			if (!V[7].y)
+				output[FOUND] = output[NFLAG & W[3].y] = W[3].y;
+		}
+	#else
+		if (V[7] == 0x136032ed)
+			output[FOUND] = output[NFLAG & W[3]] = W[3];
+	#endif
 #endif
 }
diff --git a/findnonce.c b/findnonce.c
index 35fd14e..da9c4ec 100644
--- a/findnonce.c
+++ b/findnonce.c
@@ -115,7 +115,15 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
 
 
 	blk->PreVal4addT1 = blk->PreVal4 + blk->T1;
-	blk->T1substate0 = state[0] - blk->T1;
+	blk->T1substate0 = blk->ctx_a - blk->T1;
+
+	blk->B1addK6 = blk->cty_b + 0x923f82a4;
+	blk->PreVal0addK7 = blk->PreVal0 + 0xab1c5ed5;
+	blk->W16addK16 = blk->W16 + 0xe49b69c1;
+	blk->W17addK17 = blk->W17 + 0xefbe4786;
+
+	blk->A0 = blk->ctx_a + 0x98c7e2a2;
+	blk->B0 = blk->ctx_a + 0xfc08884d;
 }
 
 #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10)))
diff --git a/miner.h b/miner.h
index adedaaa..a13113c 100644
--- a/miner.h
+++ b/miner.h
@@ -610,6 +610,10 @@ typedef struct {
 	cl_uint PreW19;
 	cl_uint PreW31;
 	cl_uint PreW32;
+
+	/* For diakgcn */
+	cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17;
+	cl_uint A0, B0;
 } dev_blk_ctx;
 #else
 typedef struct {