Revert to the legacy nonce creation in the kernel (without the vector offset), but keep the GOFFSET code removed.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
diff --git a/device-gpu.c b/device-gpu.c
index 233e59d..a526c70 100644
--- a/device-gpu.c
+++ b/device-gpu.c
@@ -835,7 +835,8 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk,
return status;
}
-static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads)
+static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk,
+ __maybe_unused cl_uint threads)
{
cl_kernel *kernel = &clState->kernel;
cl_uint vwidth = clState->vwidth;
@@ -845,7 +846,7 @@ static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint
nonces = alloca(sizeof(uint) * vwidth);
for (i = 0; i < vwidth; i++)
- nonces[i] = blk->nonce + (i * threads);
+ nonces[i] = blk->nonce + i;
CL_SET_VARG(vwidth, nonces);
CL_SET_BLKARG(PreVal0);
diff --git a/diakgcn120222.cl b/diakgcn120222.cl
index fa98dbd..89421a2 100644
--- a/diakgcn120222.cl
+++ b/diakgcn120222.cl
@@ -53,7 +53,15 @@ __kernel
u V[8];
u W[16];
- const u nonce = base + (uint)get_global_id(0);
+#ifdef VECTORS8
+ const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) << 3) + ((uint)get_local_id(0) << 3) + base;
+#elif defined VECTORS4
+ const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) << 2) + ((uint)get_local_id(0) << 2) + base;
+#elif defined VECTORS2
+ const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) << 1) + ((uint)get_local_id(0) << 1) + base;
+#else
+ const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0)) + (uint)get_local_id(0) + base;
+#endif
V[0] = PreVal0 + nonce;
V[1] = B1;
@@ -108,7 +116,16 @@ __kernel
//----------------------------------------------------------------------------------
+#ifdef VECTORS8
+ W[0] = PreW18 + (u)( rotr25(nonce.s0), rotr25(nonce.s0) ^ 0x2004000U, rotr25(nonce.s0) ^ 0x4008000U, rotr25(nonce.s0) ^ 0x600c000U,
+ rotr25(nonce.s0) ^ 0x8010000U, rotr25(nonce.s0) ^ 0xa014000U, rotr25(nonce.s0) ^ 0xc018000U, rotr25(nonce.s0) ^ 0xe01c000U);
+#elif defined VECTORS4
+ W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U, rotr25(nonce.x) ^ 0x4008000U, rotr25(nonce.x) ^ 0x600c000U);
+#elif defined VECTORS2
+ W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U);
+#else
W[0] = PreW18 + rotr25(nonce);
+#endif
W[1] = PreW19 + nonce;
W[2] = 0x80000000U + rotr15(W[0]);
W[3] = rotr15(W[1]);