Commit dc2d553d5bad301ee553a1276abed4b9635b4d2c

Con Kolivas 2012-02-14T21:10:47

Hand optimise first variable declaration order in poclbm kernel.

diff --git a/poclbm120213.cl b/poclbm120213.cl
index f2455c1..3ab6f63 100644
--- a/poclbm120213.cl
+++ b/poclbm120213.cl
@@ -88,31 +88,38 @@ __kernel void search(const uint state0, const uint state1, const uint state2, co
 
 W[20]=fcty_e;
 W[20]+=nonce;
-W[16]=state0;
-W[16]+=W[20];
-W[19]=d1;
-W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25));
+
+W[16]=W[20];
+W[16]+=state0;
+
+W[19]=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25));
+W[19]+=d1;
 W[19]+=ch(W[16],b1,c1);
 W[19]+=K[4];
-W[23]=h1;
 W[19]+=0x80000000;
-W[23]+=W[19];
+
+W[23]=W[19];
+W[23]+=h1;
 W[20]+=fcty_e2;
 W[19]+=(rotr(W[20],2)^rotr(W[20],13)^rotr(W[20],22));
+
 W[18]=c1;
 W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25));
 W[18]+=ch(W[23],W[16],b1);
 W[18]+=K[5];
-W[22]=g1;
-W[22]+=W[18];
+
+W[22]=W[18];
+W[22]+=g1;
 W[19]+=Ma2(g1,W[20],f1);
 W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22));
+
 W[17]=b1;
 W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25));
 W[17]+=ch(W[22],W[23],W[16]);
 W[17]+=K[6];
-W[21]=f1;
-W[21]+=W[17];
+
+W[21]=W[17];
+W[21]+=f1;
 W[18]+=Ma2(f1,W[19],W[20]);
 W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22));
 W[16]+=(rotr(W[21],6)^rotr(W[21],11)^rotr(W[21],25));
@@ -184,6 +191,7 @@ W[22]+=K[17];
 W[22]+=fw1;
 W[18]+=W[22];
 W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22));
+
 W[2]=(rotr(nonce,7)^rotr(nonce,18)^(nonce>>3U));
 W[2]+=fw2;
 W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25));
@@ -194,6 +202,7 @@ W[22]+=Ma(W[17],W[23],W[16]);
 W[17]+=W[21];
 W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22));
 W[21]+=Ma(W[16],W[22],W[23]);
+
 W[3]=nonce;
 W[3]+=fw3;
 W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25));
@@ -202,6 +211,7 @@ W[20]+=K[19];
 W[20]+=W[3];
 W[16]+=W[20];
 W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22));
+
 W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
 W[4]+=0x80000000;
 W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25));
@@ -215,10 +225,12 @@ W[19]+=Ma(W[22],W[20],W[21]);
 W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25));
 W[18]+=ch(W[23],W[16],W[17]);
 W[18]+=K[21];
+
 W[5]=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
 W[18]+=W[5];
 W[22]+=W[18];
 W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22));
+
 W[6]=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
 W[6]+=0x00000280U;
 W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25));
@@ -229,6 +241,7 @@ W[18]+=Ma(W[21],W[19],W[20]);
 W[21]+=W[17];
 W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22));
 W[17]+=Ma(W[20],W[18],W[19]);
+
 W[7]=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
 W[7]+=fw0;
 W[16]+=(rotr(W[21],6)^rotr(W[21],11)^rotr(W[21],25));
@@ -237,6 +250,7 @@ W[16]+=K[23];
 W[16]+=W[7];
 W[20]+=W[16];
 W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22));
+
 W[8]=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
 W[8]+=fw1;
 W[23]+=(rotr(W[20],6)^rotr(W[20],11)^rotr(W[20],25));
@@ -247,6 +261,7 @@ W[16]+=Ma(W[19],W[17],W[18]);
 W[19]+=W[23];
 W[23]+=(rotr(W[16],2)^rotr(W[16],13)^rotr(W[16],22));
 W[23]+=Ma(W[18],W[16],W[17]);
+
 W[9]=W[2];
 W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
 W[22]+=(rotr(W[19],6)^rotr(W[19],11)^rotr(W[19],25));
@@ -255,6 +270,7 @@ W[22]+=K[25];
 W[22]+=W[9];
 W[18]+=W[22];
 W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22));
+
 W[10]=W[3];
 W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
 W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25));
@@ -265,6 +281,7 @@ W[22]+=Ma(W[17],W[23],W[16]);
 W[17]+=W[21];
 W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22));
 W[21]+=Ma(W[16],W[22],W[23]);
+
 W[11]=W[4];
 W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
 W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25));
@@ -273,6 +290,7 @@ W[20]+=K[27];
 W[20]+=W[11];
 W[16]+=W[20];
 W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22));
+
 W[12]=W[5];
 W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
 W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25));
@@ -283,6 +301,7 @@ W[20]+=Ma(W[23],W[21],W[22]);
 W[23]+=W[19];
 W[19]+=(rotr(W[20],2)^rotr(W[20],13)^rotr(W[20],22));
 W[19]+=Ma(W[22],W[20],W[21]);
+
 W[13]=W[6];
 W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
 W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25));
@@ -291,6 +310,7 @@ W[18]+=K[29];
 W[18]+=W[13];
 W[22]+=W[18];
 W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22));
+
 W[14]=0x00a00055U;
 W[14]+=W[7];
 W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25));
@@ -302,6 +322,7 @@ W[18]+=Ma(W[21],W[19],W[20]);
 W[21]+=W[17];
 W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22));
 W[17]+=Ma(W[20],W[18],W[19]);
+
 W[15]=fw15;
 W[15]+=W[8];
 W[16]+=(rotr(W[21],6)^rotr(W[21],11)^rotr(W[21],25));
@@ -311,6 +332,7 @@ W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
 W[16]+=W[15];
 W[20]+=W[16];
 W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22));
+
 W[0]=fw01r;
 W[0]+=W[9];
 W[23]+=(rotr(W[20],6)^rotr(W[20],11)^rotr(W[20],25));
@@ -322,6 +344,7 @@ W[16]+=Ma(W[19],W[17],W[18]);
 W[19]+=W[23];
 W[23]+=(rotr(W[16],2)^rotr(W[16],13)^rotr(W[16],22));
 W[23]+=Ma(W[18],W[16],W[17]);
+
 W[1]=fw1;
 W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
 W[1]+=W[10];
@@ -633,52 +656,68 @@ W[16]+=W[15];
 W[20]+=W[16];
 W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22));
 W[16]+=Ma(W[19],W[17],W[18]);
+
 W[0]=W[16];
-W[7]=W[23];
-W[7]+=state7;
+
+W[7]=state7;
+W[7]+=W[23];
+
 W[23]=0xb0edbdd0;
 W[23]+=K[0];
 W[0]+=state0;
 W[23]+=W[0];
-W[3]=W[19];
-W[3]+=state3;
+
+W[3]=state3;
+W[3]+=W[19];
+
 W[19]=0xa54ff53a;
 W[19]+=W[23];
+
 W[1]=W[17];
 W[1]+=state1;
-W[6]=W[22];
-W[6]+=state6;
+
+W[6]=state6;
+W[6]+=W[22];
+
 W[22]=0x1f83d9abU;
 W[22]+=(rotr(W[19],6)^rotr(W[19],11)^rotr(W[19],25));
 W[22]+=(0x9b05688cU^(W[19]&0xca0b3af3U));
 W[22]+=K[1];
-W[2]=W[18];
-W[2]+=state2;
-W[22]+=W[1];
+
+W[2]=state2;
+W[2]+=W[18];
+
 W[18]=0x3c6ef372U;
+W[22]+=W[1];
 W[18]+=W[22];
 W[23]+=0x08909ae5U;
 W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22));
-W[5]=W[21];
-W[5]+=state5;
+
+W[5]=state5;
+W[5]+=W[21];
+
 W[21]=0x9b05688cU;
 W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25));
 W[21]+=ch(W[18],W[19],0x510e527fU);
 W[21]+=K[2];
 W[21]+=W[2];
+
 W[17]=0xbb67ae85U;
 W[17]+=W[21];
 W[22]+=Ma2(0xbb67ae85U,W[23],0x6a09e667U);
 W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22));
-W[4]=W[20];
-W[4]+=state4;
+
+W[4]=state4;
+W[4]+=W[20];
+
 W[20]=0x510e527fU;
 W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25));
 W[20]+=ch(W[17],W[18],W[19]);
 W[20]+=K[3];
 W[20]+=W[3];
-W[16]=0x6a09e667U;
-W[16]+=W[20];
+
+W[16]=W[20];
+W[16]+=0x6a09e667U;
 W[21]+=Ma2(0x6a09e667U,W[22],W[23]);
 W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22));
 W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25));
@@ -832,6 +871,7 @@ W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
 W[16]+=W[7];
 W[20]+=W[16];
 W[16]+=(rotr(W[17],2)^rotr(W[17],13)^rotr(W[17],22));
+
 W[8]=0x80000000;
 W[8]+=W[1];
 W[23]+=(rotr(W[20],6)^rotr(W[20],11)^rotr(W[20],25));
@@ -843,6 +883,7 @@ W[16]+=Ma(W[19],W[17],W[18]);
 W[19]+=W[23];
 W[23]+=(rotr(W[16],2)^rotr(W[16],13)^rotr(W[16],22));
 W[23]+=Ma(W[18],W[16],W[17]);
+
 W[9]=W[2];
 W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
 W[22]+=(rotr(W[19],6)^rotr(W[19],11)^rotr(W[19],25));
@@ -851,6 +892,7 @@ W[22]+=K[25];
 W[22]+=W[9];
 W[18]+=W[22];
 W[22]+=(rotr(W[23],2)^rotr(W[23],13)^rotr(W[23],22));
+
 W[10]=W[3];
 W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
 W[21]+=(rotr(W[18],6)^rotr(W[18],11)^rotr(W[18],25));
@@ -861,6 +903,7 @@ W[22]+=Ma(W[17],W[23],W[16]);
 W[17]+=W[21];
 W[21]+=(rotr(W[22],2)^rotr(W[22],13)^rotr(W[22],22));
 W[21]+=Ma(W[16],W[22],W[23]);
+
 W[11]=W[4];
 W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
 W[20]+=(rotr(W[17],6)^rotr(W[17],11)^rotr(W[17],25));
@@ -869,6 +912,7 @@ W[20]+=K[27];
 W[20]+=W[11];
 W[16]+=W[20];
 W[20]+=(rotr(W[21],2)^rotr(W[21],13)^rotr(W[21],22));
+
 W[12]=W[5];
 W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
 W[19]+=(rotr(W[16],6)^rotr(W[16],11)^rotr(W[16],25));
@@ -879,6 +923,7 @@ W[20]+=Ma(W[23],W[21],W[22]);
 W[23]+=W[19];
 W[19]+=(rotr(W[20],2)^rotr(W[20],13)^rotr(W[20],22));
 W[19]+=Ma(W[22],W[20],W[21]);
+
 W[13]=W[6];
 W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
 W[18]+=(rotr(W[23],6)^rotr(W[23],11)^rotr(W[23],25));
@@ -887,6 +932,7 @@ W[18]+=K[29];
 W[18]+=W[13];
 W[22]+=W[18];
 W[18]+=(rotr(W[19],2)^rotr(W[19],13)^rotr(W[19],22));
+
 W[14]=0x00400022U;
 W[14]+=W[7];
 W[17]+=(rotr(W[22],6)^rotr(W[22],11)^rotr(W[22],25));
@@ -898,6 +944,7 @@ W[18]+=Ma(W[21],W[19],W[20]);
 W[21]+=W[17];
 W[17]+=(rotr(W[18],2)^rotr(W[18],13)^rotr(W[18],22));
 W[17]+=Ma(W[20],W[18],W[19]);
+
 W[15]=0x00000100U;
 W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
 W[15]+=W[8];