Edit

thodg/cgminer/x86_64/sha256_sse4_amd64.asm

Branch :

  • Show log

    Commit

  • Author : Luke Dashjr
    Date : 2012-02-21 18:58:19
    Hash : b16beb0d
    Message : Bugfix: allow no-exec (NX) stack

  • x86_64/sha256_sse4_amd64.asm
  • ;; SHA-256 for X86-64 for Linux, based off of:
    
    ; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
    ; Version 2011
    ; This software is Public Domain
    
    ; Significant re-write/optimisation and reordering by,
    ; Neil Kettle <mu-b@digit-labs.org>
    ; ~18% performance improvement
    
    ; SHA-256 CPU SSE cruncher for Bitcoin Miner
    
    ; Start the code on a 32-byte boundary and assemble for 64-bit mode.
    ALIGN 32
    BITS 64

    ; Symbolic names for the three argument registers; the routine below
    ; receives hash in rdi, data in rsi and init in rdx.
    %define hash rdi
    %define data rsi
    %define init rdx

    ; Unroll controls for the message-schedule expansion loop (LAB_CALC):
    ;   LAB_CALC_PARA   - step between successive lab_calc_blk arguments;
    ;                     lab_calc_blk itself produces two 16-byte rows per
    ;                     invocation, so this must stay at 2
    ;   LAB_CALC_UNROLL - lab_calc_blk invocations per loop iteration
    ; One loop iteration therefore advances UNROLL*PARA*16 bytes through the
    ; region between byte offsets 256 and 1024 of data, and that step has to
    ; divide the region's size exactly:
    ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
    %define LAB_CALC_PARA	2
    %define LAB_CALC_UNROLL	8

    ; Number of lab_loop_blk rounds emitted per iteration of the round loop
    ; (LAB_LOOP).
    %define LAB_LOOP_UNROLL 8

    ; Constant table added to the schedule row in every round; it is read
    ; 16 bytes per round.  Defined outside this file -- presumably each
    ; SHA-256 round constant replicated into four dwords (verify against the
    ; definition).
    extern g_4sha256_k
    
    global CalcSha256_x64_sse4

    ;-----------------------------------------------------------------------
    ; CalcSha256_x64_sse4	hash(rdi), data(rsi), init(rdx)
    ;
    ; Runs the 64 SHA-256 rounds on four independent 32-bit lanes at once
    ; (one lane per dword of an xmm register).
    ;
    ; ABI:   System V AMD64 register arguments; leaf routine, no stack use
    ; In:    data = 64 rows of 16 bytes (one schedule word for each of the
    ;               four lanes per row).  Rows 0..15 are read as supplied by
    ;               the caller; rows 16..63 are overwritten here.
    ;        init = 8 dwords, the starting a..h; every dword is broadcast to
    ;               all four lanes.
    ; Out:   hash = 8 rows of 16 bytes holding (init + a..h after 64 rounds)
    ;               for the four lanes, in the order a, b, c, d, e, f, g, h.
    ;        No register result is defined (rax is left holding the round
    ;        counter).
    ; Clobb: rax, rcx, r10, r11, xmm0-xmm11, flags
    ; Align: hash, data, init and g_4sha256_k must all be 16-byte aligned;
    ;        movdqa, movntdqa and paddd with a memory operand fault otherwise.
    ;-----------------------------------------------------------------------

    %define SHA256_ROUNDS    64     ; rounds == rows in the expanded schedule
    %define SHA256_MSG_ROWS  16     ; schedule rows supplied by the caller
    %define ROW_BYTES        16     ; bytes per schedule row (4 lanes x dword)

    ;-----------------------------------------------------------------------
    ; lab_calc_blk n
    ;
    ; Expands two adjacent schedule rows, W[i] and W[i+1], where row i lives
    ; at [r11 + n*16]:
    ;     W[i]  = s0(W[i-15]) + W[i-16] + s1(W[i-2]) + W[i-7]
    ;     s0(x) = Rotr32(x, 7)  ^ Rotr32(x, 18) ^ (x >> 3)
    ;     s1(x) = Rotr32(x, 17) ^ Rotr32(x, 19) ^ (x >> 10)
    ; SSE has no packed rotate, so each rotate is built from one right shift
    ; and one left shift that are XORed in separately.  The two rows are
    ; computed in interleaved instruction streams; the instruction order is
    ; inherited from the tuned original and is deliberately left untouched.
    ;
    ; In the comments: A = W[i-15], B = W[i-14], C = W[i-2], D = W[i-1].
    ; In:    r11 = address of the first row of the current 16-row group
    ; Clobb: xmm0-xmm7
    ;-----------------------------------------------------------------------
    %macro	lab_calc_blk 1

            movntdqa    xmm0, [r11-(15-%1)*16]          ; xmm0 = A
            movdqa      xmm2, xmm0                      ; xmm2 = A
            movntdqa    xmm4, [r11-(15-(%1+1))*16]      ; xmm4 = B
            movdqa      xmm6, xmm4                      ; xmm6 = B

            psrld       xmm0, 3                         ; xmm0 = A >> 3
            movdqa      xmm1, xmm0                      ; xmm1 = A >> 3
            pslld       xmm2, 14                        ; xmm2 = A << 14
            psrld       xmm4, 3                         ; xmm4 = B >> 3
            movdqa      xmm5, xmm4                      ; xmm5 = B >> 3
            psrld       xmm5, 4                         ; xmm5 = B >> 7
            pxor        xmm4, xmm5                      ; xmm4 = (B >> 3) ^ (B >> 7)
            pslld       xmm6, 14                        ; xmm6 = B << 14
            psrld       xmm1, 4                         ; xmm1 = A >> 7
            pxor        xmm0, xmm1                      ; xmm0 = (A >> 3) ^ (A >> 7)
            pxor        xmm0, xmm2                      ; xmm0 ^= A << 14
            psrld       xmm1, 11                        ; xmm1 = A >> 18
            psrld       xmm5, 11                        ; xmm5 = B >> 18
            pxor        xmm4, xmm6                      ; xmm4 ^= B << 14
            pxor        xmm4, xmm5                      ; xmm4 ^= B >> 18
            pslld       xmm2, 11                        ; xmm2 = A << 25
            pslld       xmm6, 11                        ; xmm6 = B << 25
            pxor        xmm4, xmm6                      ; xmm4 = s0(B)
            pxor        xmm0, xmm1                      ; xmm0 ^= A >> 18
            pxor        xmm0, xmm2                      ; xmm0 = s0(A)
            paddd       xmm0, [r11-(16-%1)*16]          ; xmm0 = s0(A) + W[i-16]
            paddd       xmm4, [r11-(16-(%1+1))*16]      ; xmm4 = s0(B) + W[i-15]
            movntdqa    xmm3, [r11-(2-%1)*16]           ; xmm3 = C
            movntdqa    xmm7, [r11-(2-(%1+1))*16]       ; xmm7 = D

            movdqa      xmm2, xmm3                      ; xmm2 = C
            psrld       xmm3, 10                        ; xmm3 = C >> 10
            movdqa      xmm1, xmm3                      ; xmm1 = C >> 10
            movdqa      xmm6, xmm7                      ; xmm6 = D
            psrld       xmm7, 10                        ; xmm7 = D >> 10
            movdqa      xmm5, xmm7                      ; xmm5 = D >> 10

            paddd       xmm0, [r11-(7-%1)*16]           ; xmm0 += W[i-7]
            paddd       xmm4, [r11-(7-(%1+1))*16]       ; xmm4 += W[i-6]

            pslld       xmm2, 13                        ; xmm2 = C << 13
            pslld       xmm6, 13                        ; xmm6 = D << 13
            psrld       xmm1, 7                         ; xmm1 = C >> 17
            psrld       xmm5, 7                         ; xmm5 = D >> 17

            pxor        xmm3, xmm1                      ; xmm3 = (C >> 10) ^ (C >> 17)
            psrld       xmm1, 2                         ; xmm1 = C >> 19
            pxor        xmm3, xmm2                      ; xmm3 ^= C << 13
            pslld       xmm2, 2                         ; xmm2 = C << 15
            pxor        xmm7, xmm5                      ; xmm7 = (D >> 10) ^ (D >> 17)
            psrld       xmm5, 2                         ; xmm5 = D >> 19
            pxor        xmm7, xmm6                      ; xmm7 ^= D << 13
            pslld       xmm6, 2                         ; xmm6 = D << 15

            pxor        xmm3, xmm1                      ; xmm3 ^= C >> 19
            pxor        xmm3, xmm2                      ; xmm3 = s1(C)
            paddd       xmm0, xmm3                      ; xmm0 = W[i]
            pxor        xmm7, xmm5                      ; xmm7 ^= D >> 19
            pxor        xmm7, xmm6                      ; xmm7 = s1(D)
            paddd       xmm4, xmm7                      ; xmm4 = W[i+1]

            movdqa      [r11+(%1*16)], xmm0             ; store W[i]
            movdqa      [r11+((%1+1)*16)], xmm4         ; store W[i+1]
    %endmacro

    ;-----------------------------------------------------------------------
    ; lab_loop_blk
    ;
    ; One SHA-256 round on all four lanes, for round j = rax/4:
    ;     t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25))
    ;            + ((e & f) ^ AndNot(e, g)) + k[j] + w[j]
    ;     t2 = (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22))
    ;            + ((a & b) ^ (a & c) ^ (b & c))
    ;     h = g, g = f, f = e, e = d + t1, d = c, c = b, b = a, a = t1 + t2
    ;
    ; Working state:  a = xmm7   b = xmm5   c = xmm4   d = xmm3
    ;                 e = xmm0   f = xmm8   g = xmm9   h = xmm10
    ; In:    data = schedule rows, r10 = address of g_4sha256_k,
    ;        rax  = 4 * round index (advanced by 4 here)
    ; Clobb: xmm1, xmm2, xmm6, flags
    ;-----------------------------------------------------------------------
    %macro	lab_loop_blk 0
            movntdqa    xmm6, [data+rax*4]              ; xmm6 = w[j]
            paddd       xmm6, [r10+rax*4]               ; + k[j] (table base in r10)
            add         rax, 4                          ; next round

            paddd       xmm6, xmm10                     ; + h

            movdqa      xmm1, xmm0                      ; e
            movdqa      xmm2, xmm9                      ; g
            pandn       xmm1, xmm2                      ; ~e & g

            movdqa      xmm10, xmm2                     ; h = g
            movdqa      xmm2, xmm8                      ; f
            movdqa      xmm9, xmm2                      ; g = f

            pand        xmm2, xmm0                      ; e & f
            pxor        xmm1, xmm2                      ; Ch = (e & f) ^ (~e & g)
            movdqa      xmm8, xmm0                      ; f = e

            paddd       xmm6, xmm1                      ; Ch + h + w[j] + k[j]

            movdqa      xmm1, xmm0                      ; e
            psrld       xmm0, 6                         ; e >> 6
            movdqa      xmm2, xmm0                      ; e >> 6
            pslld       xmm1, 7                         ; e << 7
            psrld       xmm2, 5                         ; e >> 11
            pxor        xmm0, xmm1
            pxor        xmm0, xmm2
            pslld       xmm1, 14                        ; e << 21
            psrld       xmm2, 14                        ; e >> 25
            pxor        xmm0, xmm1
            pxor        xmm0, xmm2
            pslld       xmm1, 5                         ; e << 26
            pxor        xmm0, xmm1                      ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
            paddd       xmm6, xmm0                      ; xmm6 = t1

            movdqa      xmm0, xmm3                      ; d
            paddd       xmm0, xmm6                      ; e = d + t1

            movdqa      xmm1, xmm5                      ; b
            movdqa      xmm3, xmm4                      ; d = c
            movdqa      xmm2, xmm4                      ; c
            pand        xmm2, xmm5                      ; b & c
            pand        xmm4, xmm7                      ; a & c
            pand        xmm1, xmm7                      ; a & b
            pxor        xmm1, xmm4                      ; (a & b) ^ (a & c)
            movdqa      xmm4, xmm5                      ; c = b
            movdqa      xmm5, xmm7                      ; b = a
            pxor        xmm1, xmm2                      ; Maj = (a & b) ^ (a & c) ^ (b & c)
            paddd       xmm6, xmm1                      ; t1 + Maj

            movdqa      xmm2, xmm7                      ; a
            psrld       xmm7, 2                         ; a >> 2
            movdqa      xmm1, xmm7                      ; a >> 2
            pslld       xmm2, 10                        ; a << 10
            psrld       xmm1, 11                        ; a >> 13
            pxor        xmm7, xmm2
            pxor        xmm7, xmm1
            pslld       xmm2, 9                         ; a << 19
            psrld       xmm1, 9                         ; a >> 22
            pxor        xmm7, xmm2
            pxor        xmm7, xmm1
            pslld       xmm2, 11                        ; a << 30
            pxor        xmm7, xmm2                      ; Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)
            paddd       xmm7, xmm6                      ; a = t1 + t2
    %endmacro

    CalcSha256_x64_sse4:

    ; ---- Expand schedule rows 16..63 in place -----------------------------
    ; r11 walks the rows being produced, rcx is the first byte past row 63.

            lea         rcx, [data+SHA256_ROUNDS*ROW_BYTES]     ; end of schedule
            lea         r11, [data+SHA256_MSG_ROWS*ROW_BYTES]   ; first row to fill

    LAB_CALC:
    %assign i 0
    %rep    LAB_CALC_UNROLL
            lab_calc_blk i
    %assign i i+LAB_CALC_PARA
    %endrep

            add         r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
            cmp         r11, rcx
            jb          LAB_CALC

    ; ---- Load the starting state, broadcasting each dword to all lanes ----

            movntdqa    xmm7, [init]
            pshufd      xmm5, xmm7, 0x55                ; xmm5 == b
            pshufd      xmm4, xmm7, 0xAA                ; xmm4 == c
            pshufd      xmm3, xmm7, 0xFF                ; xmm3 == d
            pshufd      xmm7, xmm7, 0                   ; xmm7 == a

            movntdqa    xmm0, [init+4*4]
            pshufd      xmm8, xmm0, 0x55                ; xmm8 == f
            pshufd      xmm9, xmm0, 0xAA                ; xmm9 == g
            pshufd      xmm10, xmm0, 0xFF               ; xmm10 == h
            pshufd      xmm0, xmm0, 0                   ; xmm0 == e

    ; ---- 64 rounds --------------------------------------------------------
    ; The constant table is addressed RIP-relative through r10 so the object
    ; also links into position-independent executables; an absolute
    ; [g_4sha256_k+rax*4] operand needs a 32-bit absolute relocation.

            lea         r10, [rel g_4sha256_k]
            xor         eax, eax                        ; rax = 4 * round index

    LAB_LOOP:
    %rep    LAB_LOOP_UNROLL
            lab_loop_blk
    %endrep

            cmp         rax, SHA256_ROUNDS*4
            jb          LAB_LOOP

    ; ---- Finished the 64 rounds: add the starting state back and store ----

            movntdqa    xmm1, [init]
            pshufd      xmm2, xmm1, 0x55
            paddd       xmm5, xmm2                      ; b += init b
            pshufd      xmm6, xmm1, 0xAA
            paddd       xmm4, xmm6                      ; c += init c
            pshufd      xmm11, xmm1, 0xFF
            paddd       xmm3, xmm11                     ; d += init d
            pshufd      xmm1, xmm1, 0
            paddd       xmm7, xmm1                      ; a += init a

            movntdqa    xmm1, [init+4*4]
            pshufd      xmm2, xmm1, 0x55
            paddd       xmm8, xmm2                      ; f += init f
            pshufd      xmm6, xmm1, 0xAA
            paddd       xmm9, xmm6                      ; g += init g
            pshufd      xmm11, xmm1, 0xFF
            paddd       xmm10, xmm11                    ; h += init h
            pshufd      xmm1, xmm1, 0
            paddd       xmm0, xmm1                      ; e += init e

            movdqa      [hash+0*16], xmm7               ; a
            movdqa      [hash+1*16], xmm5               ; b
            movdqa      [hash+2*16], xmm4               ; c
            movdqa      [hash+3*16], xmm3               ; d
            movdqa      [hash+4*16], xmm0               ; e
            movdqa      [hash+5*16], xmm8               ; f
            movdqa      [hash+6*16], xmm9               ; g
            movdqa      [hash+7*16], xmm10              ; h

            ret
    
    ; On ELF targets (32- or 64-bit) emit an empty .note.GNU-stack section so
    ; that linking this object does not force an executable stack.
    %ifidn __OUTPUT_FORMAT__,elf
    section .note.GNU-stack noalloc noexec nowrite progbits
    %elifidn __OUTPUT_FORMAT__,elf64
    section .note.GNU-stack noalloc noexec nowrite progbits
    %endif