Some hardware might benefit from the fewer ops, so there's no harm in leaving it there apart from the readability of the code. Revert "Simplify the output part of the kernel. There's no demonstrable advantage from more complexity." This reverts commit 53b53cb666ff09dfd9034fc6f6987516d0f6fcc7.
diff --git a/phatk110722.cl b/phatk110722.cl
index 77379d1..43cdacf 100644
--- a/phatk110722.cl
+++ b/phatk110722.cl
@@ -394,26 +394,26 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
#if defined(VECTORS4) || defined(VECTORS2)
if (Vals[7].x == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & W_3.x] = W_3.x;
+ output[MAXBUFFERS] = output[NFLAG & (W[3].x >> 2)] = W_3.x;
}
if (Vals[7].y == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & W_3.y] = W_3.y;
+ output[MAXBUFFERS] = output[NFLAG & (W[3].y >> 2)] = W_3.y;
}
#ifdef VECTORS4
if (Vals[7].z == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & W_3.z] = W_3.z;
+ output[MAXBUFFERS] = output[NFLAG & (W[3].z >> 2)] = W_3.z;
}
if (Vals[7].w == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & W_3.w] = W_3.w;
+ output[MAXBUFFERS] = output[NFLAG & (W[3].w >> 2)] = W_3.w;
}
#endif
#else
if (Vals[7] == -H[7])
{
- output[MAXBUFFERS] = output[NFLAG & W_3] = W_3;
+ output[MAXBUFFERS] = output[NFLAG & (W[3] >> 2)] = W_3;
}
#endif
}