Commit d356f44d5389afaf2a9740b4b27c7d1fa7b8896d

ckolivas 2012-01-11T11:12:13

Micro-optimisation in sha256_sse2 code courtesy of Guido Ascioti <guido.ascioti@gmail.com>

diff --git a/sha256_sse2_i386.c b/sha256_sse2_i386.c
index ef3f0ee..72a90c9 100644
--- a/sha256_sse2_i386.c
+++ b/sha256_sse2_i386.c
@@ -67,12 +67,6 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
 
     work_restart[thr_id].restart = 0;
 
-    /* For debugging */
-    union {
-        __m128i m;
-        uint32_t i[4];
-    } mi;
-
     /* Message expansion */
     memcpy(m_midstate, pmidstate, sizeof(m_midstate));
     memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */
@@ -102,17 +96,12 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
 	CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init);
 
 	for (j = 0; j < 4; j++) {
-	    mi.m = m_4hash[7];
-	    if (unlikely(mi.i[j] == 0))
-		break;
-        }
-
-	/* If j = true, we found a hit...so check it */
-	/* Use the C version for a check... */
-	if (unlikely(j != 4)) {
+	    if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) {
+		/* We found a hit...so check it */
+		/* Use the C version for a check... */
+
 		for (i = 0; i < 8; i++) {
-		    mi.m = m_4hash[i];
-		    *(uint32_t *)&(phash)[i*4] = mi.i[j];
+		    *(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j];
 		}
 
 		if (fulltest(phash, ptarget)) {
@@ -120,6 +109,7 @@ int scanhash_sse2_32(int thr_id, const unsigned char *pmidstate,
 		     *nNonce_p = nonce + j;
 		     return nonce + j;
 		}
+	    }
 	}
 
 	nonce += 4;
diff --git a/x86_32/sha256_xmm.asm b/x86_32/sha256_xmm.asm
index b2a8fbb..601cf2b 100644
--- a/x86_32/sha256_xmm.asm
+++ b/x86_32/sha256_xmm.asm
@@ -1,4 +1,4 @@
-;; SHA-256 for X86 for Linux, based off of:
+;; SHA-256 for X86 for Linux, based off of:
 
 ; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
 ; Version 2011
@@ -15,30 +15,21 @@ BITS 32
 
 ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
 %define LAB_CALC_PARA	2
-%define LAB_CALC_UNROLL	8
+%define LAB_CALC_UNROLL	24
 
-%define LAB_LOOP_UNROLL 8
+%define LAB_LOOP_UNROLL 64
 
 extern sha256_consts_m128i
 
 global CalcSha256_x86
 ;	CalcSha256	hash(ecx), data(edx), init([esp+4])
 CalcSha256_x86:
-                push	esi
-                push	edi
-                mov	init, [esp+12]
-
-	push	ebx
-
-LAB_NEXT_NONCE:
-
-	mov	eax, 64*4					; 256 - rcx is # of SHA-2 rounds
-	mov	ebx, 16*4					; 64 - rax is where we expand to
+	push	esi
+	push	edi
+	mov	init, [esp+12]
 
 LAB_SHA:
-	push	eax
-	lea	eax, qword [data+eax*4]				; + 1024
-	lea	edi, qword [data+ebx*4]				; + 256
+	lea	edi, qword [data+256]				; + 256
 
 LAB_CALC:
 %macro	lab_calc_blk 1
@@ -116,13 +107,6 @@ LAB_CALC:
 %assign i i+LAB_CALC_PARA
 %endrep
 
-	add	edi, LAB_CALC_UNROLL*LAB_CALC_PARA*16
-	cmp	edi, eax
-	jb	LAB_CALC
-
-	pop	eax
-	mov	ebx, 0
-
 ; Load the init values of the message into the hash.
 
 	movdqa	xmm7, [init]
@@ -143,14 +127,14 @@ LAB_CALC:
 
 	pshufd	xmm0, xmm0, 0			; xmm0 == e
 
+
 LAB_LOOP:
 
 ;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
 
-%macro	lab_loop_blk 0
-	movdqa	xmm6, [data+ebx*4]
-	paddd	xmm6, sha256_consts_m128i[ebx*4]
-	add	ebx, 4
+%macro	lab_loop_blk 1
+	movdqa	xmm6, [data+%1]
+	paddd	xmm6, sha256_consts_m128i[%1]
 
 	paddd	xmm6, [hash+2*16]		; +h
 
@@ -217,68 +201,52 @@ LAB_LOOP:
 
 %assign i 0
 %rep    LAB_LOOP_UNROLL
-        lab_loop_blk
-%assign i i+1
+        lab_loop_blk i
+%assign i i+16
 %endrep
 
-	cmp	ebx, eax
-	jb	LAB_LOOP
-
 ; Finished the 64 rounds, calculate hash and save
 
-	movdqa	xmm1, [init]
-	pshufd	xmm2, xmm1, 0x55
-	pshufd	xmm6, xmm1, 0xAA
-	movdqa	[hash+3*16], xmm6
-	pshufd	xmm6, xmm1, 0xFF
-	movdqa	[hash+4*16], xmm6
-	pshufd	xmm1, xmm1, 0
+	movdqa	xmm1, [init+16]
 
-	paddd	xmm5, xmm2
-	paddd	xmm4, [hash+3*16]
-	paddd	xmm3, [hash+4*16]
-	paddd	xmm7, xmm1
-
-	movdqa	xmm1, [init+4*4]
-	pshufd	xmm2, xmm1, 0x55
-	pshufd	xmm6, xmm1, 0xAA
-	movdqa	[hash+3*16], xmm6
-	pshufd	xmm6, xmm1, 0xFF
-	movdqa	[hash+4*16], xmm6
-	pshufd	xmm1, xmm1, 0
+	pshufd	xmm2, xmm1, 0xFF
+	movdqa  xmm6, [hash+2*16]
+	paddd   xmm2, xmm6
+	movdqa  [hash+7*16], xmm2
 
-	movdqa	xmm6, [hash+0*16]
-	paddd	xmm2, xmm6
-	movdqa	[hash+0*16], xmm2
+	pshufd	xmm2, xmm1, 0xAA
+	movdqa  xmm6, [hash+1*16]
+	paddd   xmm2, xmm6
+	movdqa  [hash+6*16], xmm2
 
+	pshufd  xmm2, xmm1, 0x55
+	movdqa  xmm6, [hash+0*16]
+	paddd   xmm2, xmm6
+	movdqa  [hash+5*16], xmm2
 
-	movdqa	xmm2, [hash+3*16]
-	movdqa	xmm6, [hash+1*16]
-	paddd	xmm2, xmm6
-	movdqa	[hash+1*16], xmm2
+	pshufd	xmm1, xmm1, 0
+	paddd	xmm0, xmm1
+	movdqa  [hash+4*16], xmm0
 
-	movdqa	xmm2, [hash+4*16]
-	movdqa	xmm6, [hash+2*16]
-	paddd	xmm2, xmm6
-	movdqa	[hash+2*16], xmm2
+	movdqa  xmm1, [init]
 
-	paddd	xmm0, xmm1
+	pshufd  xmm2, xmm1, 0xFF
+	paddd   xmm3, xmm2
+	movdqa  [hash+3*16], xmm3
 
-	movdqa	xmm1, [hash+0*16]
-	movdqa	xmm2, [hash+1*16]
-	movdqa	xmm6, [hash+2*16]
+	pshufd  xmm2, xmm1, 0xAA
+	paddd   xmm4, xmm2
+	movdqa  [hash+2*16], xmm4
+
+        pshufd  xmm2, xmm1, 0x55
+        paddd   xmm5, xmm2
+        movdqa  [hash+1*16], xmm5
 
+	pshufd  xmm1, xmm1, 0
+	paddd   xmm7, xmm1
 	movdqa	[hash+0*16], xmm7
-	movdqa	[hash+1*16], xmm5
-	movdqa	[hash+2*16], xmm4
-	movdqa	[hash+3*16], xmm3
-	movdqa	[hash+4*16], xmm0
-	movdqa	[hash+5*16], xmm1
-	movdqa	[hash+6*16], xmm2
-	movdqa	[hash+7*16], xmm6
 
 LAB_RET:
-	pop	ebx
-                pop	edi
-                pop	esi
-                retn	4
+	pop	edi
+	pop	esi
+	retn	4