Base was being set wrongly, meaning we were repeating searches and the rate was actually lower than displayed :( Tweak Ma with new changes. Change default vectors to 2, since it's faster than 4 even when 4 is reported as preferred.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
diff --git a/ocl.c b/ocl.c
index e858aad..226d457 100644
--- a/ocl.c
+++ b/ocl.c
@@ -324,6 +324,10 @@ retry:
}
memcpy(source, rawsource, pl);
+ /* For some reason 2 vectors is still better even if the card says
+ * otherwise */
+ if (clState->preferred_vwidth > 1)
+ clState->preferred_vwidth = 2;
if (opt_vectors)
clState->preferred_vwidth = opt_vectors;
if (opt_worksize && opt_worksize <= clState->max_work_size)
diff --git a/poclbm.cl b/poclbm.cl
index 4ca7d2c..4b959cf 100644
--- a/poclbm.cl
+++ b/poclbm.cl
@@ -1,3 +1,6 @@
+// -ck modified kernel taken from Phoenix taken from poclbm, with aspects of
+// phatk and others.
+
// This file is taken and modified from the public-domain poclbm project, and
// we have therefore decided to keep it public-domain in Phoenix.
@@ -47,7 +50,7 @@ __constant uint K[64] = {
#define ch(x, y, z) amd_bytealign(x, y, z)
// Ma can also be implemented in terms of BFI_INT...
- #define Ma(x, y, z) amd_bytealign((y), (x | z), (z & x))
+ #define Ma(x, y, z) amd_bytealign( (z^x), (y), (x) )
#else
#define ch(x, y, z) (z ^ (x & (y ^ z)))
#define Ma(x, y, z) ((x & z) | (y & (x | z)))
@@ -76,12 +79,12 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
u W[24];
u Vals[8];
u nonce;
- uint it;
+ u it;
#ifdef VECTORS4
- nonce = ((base >> 2) + (get_global_id(0))<<2) + (uint4)(0, 1, 2, 3);
+ nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3);
#elif defined VECTORS2
- nonce = ((base >> 1) + (get_global_id(0))<<1) + (uint2)(0, 1);
+ nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1);
#else
nonce = base + get_global_id(0);
#endif
@@ -627,9 +630,9 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
#if defined(VECTORS4) || defined(VECTORS2)
if (Vals[7].x == 0)
{
- for (it = 0; it != 127; it++) {
- if (!output[it]) {
- output[it] = nonce.x;
+ for (it.x = 0; it.x != 127; it.x++) {
+ if (!output[it.x]) {
+ output[it.x] = nonce.x;
output[127] = 1;
break;
}
@@ -637,9 +640,9 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
}
if (Vals[7].y == 0)
{
- for (it = 0; it != 127; it++) {
- if (!output[it]) {
- output[it] = nonce.y;
+ for (it.y = 0; it.y != 127; it.y++) {
+ if (!output[it.y]) {
+ output[it.y] = nonce.y;
output[127] = 1;
break;
}
@@ -648,9 +651,9 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
#ifdef VECTORS4
if (Vals[7].z == 0)
{
- for (it = 0; it != 127; it++) {
- if (!output[it]) {
- output[it] = nonce.z;
+ for (it.z = 0; it.z != 127; it.z++) {
+ if (!output[it.z]) {
+ output[it.z] = nonce.z;
output[127] = 1;
break;
}
@@ -658,9 +661,9 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
}
if (Vals[7].w == 0)
{
- for (it = 0; it != 127; it++) {
- if (!output[it]) {
- output[it] = nonce.w;
+ for (it.w = 0; it.w != 127; it.w++) {
+ if (!output[it.w]) {
+ output[it.w] = nonce.w;
output[127] = 1;
break;
}