Simplify the output part of the kernel: index the output buffer with NFLAG & W_3 directly instead of NFLAG & (W[3] >> 2). There's no demonstrable advantage from the extra complexity.
diff --git a/phatk110722.cl b/phatk110722.cl
index 43cdacf..77379d1 100644
--- a/phatk110722.cl
+++ b/phatk110722.cl
@@ -394,26 +394,26 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
#if defined(VECTORS4) || defined(VECTORS2)
if (Vals[7].x == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & (W[3].x >> 2)] = W_3.x;
+ output[MAXBUFFERS] = output[NFLAG & W_3.x] = W_3.x;
}
if (Vals[7].y == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & (W[3].y >> 2)] = W_3.y;
+ output[MAXBUFFERS] = output[NFLAG & W_3.y] = W_3.y;
}
#ifdef VECTORS4
if (Vals[7].z == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & (W[3].z >> 2)] = W_3.z;
+ output[MAXBUFFERS] = output[NFLAG & W_3.z] = W_3.z;
}
if (Vals[7].w == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & (W[3].w >> 2)] = W_3.w;
+ output[MAXBUFFERS] = output[NFLAG & W_3.w] = W_3.w;
}
#endif
#else
if (Vals[7] == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & (W[3] >> 2)] = W_3;
+ output[MAXBUFFERS] = output[NFLAG & W_3] = W_3;
}
#endif
}