Move the phatk kernel to offset-based vector nonce bases as well.
Each vector lane's nonce base is now offset by the thread count (nonce + i * threads) rather than by the lane index alone, and the kernel adds the global work-item id to that base.
diff --git a/device-gpu.c b/device-gpu.c
index 2f0fc52..e92e8c8 100644
--- a/device-gpu.c
+++ b/device-gpu.c
@@ -819,7 +819,7 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk,
nonces = alloca(sizeof(uint) * vwidth);
for (i = 0; i < vwidth; i++)
- nonces[i] = blk->nonce + i;
+ nonces[i] = blk->nonce + (i * threads);
CL_SET_VARG(vwidth, nonces);
CL_SET_BLKARG(W16);
diff --git a/phatk120222.cl b/phatk120222.cl
index 5c89fb9..bb49ce1 100644
--- a/phatk120222.cl
+++ b/phatk120222.cl
@@ -168,8 +168,8 @@ void search( const uint state0, const uint state1, const uint state2, const uint
{
- u W[124];
- u Vals[8];
+ u W[132];
+ u *Vals=&W[124];
//Dummy Variable to prevent compiler from reordering between rounds
u t1;
@@ -186,23 +186,11 @@ void search( const uint state0, const uint state1, const uint state2, const uint
W[16] = W16;
W[17] = W17;
-#ifdef VECTORS4
- //Less dependencies to get both the local id and group id and then add them
- W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
- uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
- //Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
- W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
-#elif defined VECTORS2
- W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
- uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
- W[18] = PreW18 + (u){r, r ^ 0x2004000U};
-#else
- W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
+ W[3] = base + (uint)get_global_id(0);
u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
W[18] = PreW18 + r;
-#endif
- //the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions
+ //the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions
Vals[4] = PreVal4 + W[3];