Micro-optimisation in sha256_sse2 code, courtesy of Guido Ascioti <guido.ascioti@gmail.com>.

diff --git a/sha256_sse2_i386.c b/sha256_sse2_i386.c
index ef3f0ee..72a90c9 100644
--- a/sha256_sse2_i386.c
+++ b/sha256_sse2_i386.c
@@ -67,12 +67,6 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
work_restart[thr_id].restart = 0;
- /* For debugging */
- union {
- __m128i m;
- uint32_t i[4];
- } mi;
-
/* Message expansion */
memcpy(m_midstate, pmidstate, sizeof(m_midstate));
memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */
@@ -102,17 +96,12 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init);
for (j = 0; j < 4; j++) {
- mi.m = m_4hash[7];
- if (unlikely(mi.i[j] == 0))
- break;
- }
-
- /* If j = true, we found a hit...so check it */
- /* Use the C version for a check... */
- if (unlikely(j != 4)) {
+ if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) {
+ /* We found a hit...so check it */
+ /* Use the C version for a check... */
+
for (i = 0; i < 8; i++) {
- mi.m = m_4hash[i];
- *(uint32_t *)&(phash)[i*4] = mi.i[j];
+ *(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j];
}
if (fulltest(phash, ptarget)) {
@@ -120,6 +109,7 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
*nNonce_p = nonce + j;
return nonce + j;
}
+ }
}
nonce += 4;
diff --git a/x86_32/sha256_xmm.asm b/x86_32/sha256_xmm.asm
index b2a8fbb..601cf2b 100644
--- a/x86_32/sha256_xmm.asm
+++ b/x86_32/sha256_xmm.asm
@@ -1,4 +1,4 @@
-;; SHA-256 for X86 for Linux, based off of:
+;; SHA-256 for X86 for Linux, based off of:
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
@@ -15,30 +15,21 @@ BITS 32
; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define LAB_CALC_PARA 2
-%define LAB_CALC_UNROLL 8
+%define LAB_CALC_UNROLL 24
-%define LAB_LOOP_UNROLL 8
+%define LAB_LOOP_UNROLL 64
extern sha256_consts_m128i
global CalcSha256_x86
; CalcSha256 hash(ecx), data(edx), init([esp+4])
CalcSha256_x86:
- push esi
- push edi
- mov init, [esp+12]
-
- push ebx
-
-LAB_NEXT_NONCE:
-
- mov eax, 64*4 ; 256 - rcx is # of SHA-2 rounds
- mov ebx, 16*4 ; 64 - rax is where we expand to
+ push esi
+ push edi
+ mov init, [esp+12]
LAB_SHA:
- push eax
- lea eax, qword [data+eax*4] ; + 1024
- lea edi, qword [data+ebx*4] ; + 256
+ lea edi, qword [data+256] ; + 256
LAB_CALC:
%macro lab_calc_blk 1
@@ -116,13 +107,6 @@ LAB_CALC:
%assign i i+LAB_CALC_PARA
%endrep
- add edi, LAB_CALC_UNROLL*LAB_CALC_PARA*16
- cmp edi, eax
- jb LAB_CALC
-
- pop eax
- mov ebx, 0
-
; Load the init values of the message into the hash.
movdqa xmm7, [init]
@@ -143,14 +127,14 @@ LAB_CALC:
pshufd xmm0, xmm0, 0 ; xmm0 == e
+
LAB_LOOP:
;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
-%macro lab_loop_blk 0
- movdqa xmm6, [data+ebx*4]
- paddd xmm6, sha256_consts_m128i[ebx*4]
- add ebx, 4
+%macro lab_loop_blk 1
+ movdqa xmm6, [data+%1]
+ paddd xmm6, sha256_consts_m128i[%1]
paddd xmm6, [hash+2*16] ; +h
@@ -217,68 +201,52 @@ LAB_LOOP:
%assign i 0
%rep LAB_LOOP_UNROLL
- lab_loop_blk
-%assign i i+1
+ lab_loop_blk i
+%assign i i+16
%endrep
- cmp ebx, eax
- jb LAB_LOOP
-
; Finished the 64 rounds, calculate hash and save
- movdqa xmm1, [init]
- pshufd xmm2, xmm1, 0x55
- pshufd xmm6, xmm1, 0xAA
- movdqa [hash+3*16], xmm6
- pshufd xmm6, xmm1, 0xFF
- movdqa [hash+4*16], xmm6
- pshufd xmm1, xmm1, 0
+ movdqa xmm1, [init+16]
- paddd xmm5, xmm2
- paddd xmm4, [hash+3*16]
- paddd xmm3, [hash+4*16]
- paddd xmm7, xmm1
-
- movdqa xmm1, [init+4*4]
- pshufd xmm2, xmm1, 0x55
- pshufd xmm6, xmm1, 0xAA
- movdqa [hash+3*16], xmm6
- pshufd xmm6, xmm1, 0xFF
- movdqa [hash+4*16], xmm6
- pshufd xmm1, xmm1, 0
+ pshufd xmm2, xmm1, 0xFF
+ movdqa xmm6, [hash+2*16]
+ paddd xmm2, xmm6
+ movdqa [hash+7*16], xmm2
- movdqa xmm6, [hash+0*16]
- paddd xmm2, xmm6
- movdqa [hash+0*16], xmm2
+ pshufd xmm2, xmm1, 0xAA
+ movdqa xmm6, [hash+1*16]
+ paddd xmm2, xmm6
+ movdqa [hash+6*16], xmm2
+ pshufd xmm2, xmm1, 0x55
+ movdqa xmm6, [hash+0*16]
+ paddd xmm2, xmm6
+ movdqa [hash+5*16], xmm2
- movdqa xmm2, [hash+3*16]
- movdqa xmm6, [hash+1*16]
- paddd xmm2, xmm6
- movdqa [hash+1*16], xmm2
+ pshufd xmm1, xmm1, 0
+ paddd xmm0, xmm1
+ movdqa [hash+4*16], xmm0
- movdqa xmm2, [hash+4*16]
- movdqa xmm6, [hash+2*16]
- paddd xmm2, xmm6
- movdqa [hash+2*16], xmm2
+ movdqa xmm1, [init]
- paddd xmm0, xmm1
+ pshufd xmm2, xmm1, 0xFF
+ paddd xmm3, xmm2
+ movdqa [hash+3*16], xmm3
- movdqa xmm1, [hash+0*16]
- movdqa xmm2, [hash+1*16]
- movdqa xmm6, [hash+2*16]
+ pshufd xmm2, xmm1, 0xAA
+ paddd xmm4, xmm2
+ movdqa [hash+2*16], xmm4
+
+ pshufd xmm2, xmm1, 0x55
+ paddd xmm5, xmm2
+ movdqa [hash+1*16], xmm5
+ pshufd xmm1, xmm1, 0
+ paddd xmm7, xmm1
movdqa [hash+0*16], xmm7
- movdqa [hash+1*16], xmm5
- movdqa [hash+2*16], xmm4
- movdqa [hash+3*16], xmm3
- movdqa [hash+4*16], xmm0
- movdqa [hash+5*16], xmm1
- movdqa [hash+6*16], xmm2
- movdqa [hash+7*16], xmm6
LAB_RET:
- pop ebx
- pop edi
- pop esi
- retn 4
+ pop edi
+ pop esi
+ retn 4