; Syntax: NASM/YASM (Intel operand order) -- Target: x86-64 -- ABI: System V AMD64 (Linux)
;; SHA-256 for X86-64 for Linux, based off of:
; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain
; SHA-256 CPU SSE cruncher for Bitcoin Miner
BITS 64                                 ; assemble everything below as 64-bit code

; Symbolic names for the three pointer arguments of CalcSha256_x64.
%define hash    rdi                     ; 1st argument
%define data    rsi                     ; 2nd argument
%define init    rdx                     ; 3rd argument

global  CalcSha256_x64                  ; entry point exported from this file
extern  g_4sha256_k                     ; table added to every round's input; defined in another object

ALIGN 32                                ; 32-byte alignment for the code that follows
;-----------------------------------------------------------------------
; CalcSha256_x64 -- SHA-256 compression of one block, four independent
;                   messages at a time (one message per 32-bit SSE lane).
;
; C view (presumed -- confirm against the caller's prototype):
;   void CalcSha256_x64(__m128i *hash, __m128i *data, const uint32_t *init);
;
; ABI:   System V AMD64
; In:    rdi (hash) = output: 8 vectors of 16 bytes (a..h), stored with
;                     movdqa, so the buffer must be 16-byte aligned.
;        rsi (data) = message schedule W: 64 vectors of 16 bytes, 16-byte
;                     aligned.  W[0..15] are read as input; W[16..63] are
;                     OVERWRITTEN by this routine.
;        rdx (init) = 8 dwords, the starting state a..h.  Each dword is
;                     broadcast to all four lanes.
; Ext:   g_4sha256_k = round-constant table, read 16 bytes per round as an
;                     aligned memory operand (presumably each K[t]
;                     replicated four times -- confirm at its definition).
; Out:   no meaningful register result; the digest state is in *hash.
; Clobb: rax, rcx, r8, r11, xmm0-xmm10, flags
; Stack: none used (leaf function).
;-----------------------------------------------------------------------

SHA256_ROUNDS       equ 64              ; rounds per compression / entries in W
SHA256_INPUT_WORDS  equ 16              ; W[0..15] come from the caller
LANE_VEC_BYTES      equ 16              ; one xmm vector = 4 lanes x 4 bytes
ROUND_INDEX_END     equ SHA256_ROUNDS*4 ; rax counts in steps of 4 and is scaled by 4

; Broadcast state word %2 of init[] into all four lanes of xmm register %1.
%macro LOAD_STATE_BCAST 2
        movd    %1, dword [init + %2*4]
        pshufd  %1, %1, 0
%endmacro

; Feed-forward: %1 += broadcast(init[%2]).  Uses xmm1 as scratch.
%macro ADD_INIT_BCAST 2
        movd    xmm1, dword [init + %2*4]
        pshufd  xmm1, xmm1, 0
        paddd   %1, xmm1
%endmacro

CalcSha256_x64:
        ; Load the constant-table base once.  RIP-relative addressing keeps
        ; the object linkable into position-independent executables, and
        ; [base+index*scale] is the bracketed form NASM requires.
        lea     r8, [rel g_4sha256_k]

; ---- Message schedule: W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
        lea     r11, [data + SHA256_INPUT_WORDS*LANE_VEC_BYTES] ; r11 = &W[16], write cursor
        lea     rcx, [data + SHA256_ROUNDS*LANE_VEC_BYTES]      ; rcx = &W[64], end sentinel
.expand:
        ; s0(w) = Rotr32(w, 7) ^ Rotr32(w, 18) ^ (w >> 3), with w = W[t-15]
        movdqa  xmm0, [r11 - 15*LANE_VEC_BYTES]
        movdqa  xmm2, xmm0
        psrld   xmm0, 3                         ; w >> 3
        movdqa  xmm1, xmm0
        pslld   xmm2, 14                        ; w << 14   (low half of rotr 18)
        psrld   xmm1, 4                         ; w >> 7    (high half of rotr 7)
        pxor    xmm0, xmm1
        pxor    xmm0, xmm2
        pslld   xmm2, 11                        ; w << 25   (low half of rotr 7)
        psrld   xmm1, 11                        ; w >> 18   (high half of rotr 18)
        pxor    xmm0, xmm1
        pxor    xmm0, xmm2                      ; xmm0 = s0(W[t-15])
        paddd   xmm0, [r11 - 16*LANE_VEC_BYTES] ; + W[t-16]

        ; s1(w) = Rotr32(w, 17) ^ Rotr32(w, 19) ^ (w >> 10), with w = W[t-2]
        movdqa  xmm3, [r11 - 2*LANE_VEC_BYTES]
        movdqa  xmm2, xmm3
        psrld   xmm3, 10                        ; w >> 10
        movdqa  xmm1, xmm3
        pslld   xmm2, 13                        ; w << 13   (low half of rotr 19)
        psrld   xmm1, 7                         ; w >> 17   (high half of rotr 17)
        pxor    xmm3, xmm1
        pxor    xmm3, xmm2
        pslld   xmm2, 2                         ; w << 15   (low half of rotr 17)
        psrld   xmm1, 2                         ; w >> 19   (high half of rotr 19)
        pxor    xmm3, xmm1
        pxor    xmm3, xmm2                      ; xmm3 = s1(W[t-2])
        paddd   xmm0, xmm3
        paddd   xmm0, [r11 - 7*LANE_VEC_BYTES]  ; + W[t-7]

        movdqa  [r11], xmm0                     ; W[t] = result
        add     r11, LANE_VEC_BYTES
        cmp     r11, rcx
        jb      .expand

; ---- Working state, one register per variable (all four lanes equal):
;        xmm7=a  xmm5=b  xmm4=c  xmm3=d  xmm0=e  xmm8=f  xmm9=g  xmm10=h
;      xmm1/xmm2 are scratch, xmm6 accumulates t1.
        LOAD_STATE_BCAST xmm0, 4                ; e
        LOAD_STATE_BCAST xmm3, 3                ; d
        LOAD_STATE_BCAST xmm4, 2                ; c
        LOAD_STATE_BCAST xmm5, 1                ; b
        LOAD_STATE_BCAST xmm7, 0                ; a
        LOAD_STATE_BCAST xmm8, 5                ; f
        LOAD_STATE_BCAST xmm9, 6                ; g
        LOAD_STATE_BCAST xmm10, 7               ; h

; ---- 64 rounds.  rax = 4 * round number, so rax*4 is the byte offset of
;      the round's 16-byte entry in both W and the constant table.
        xor     eax, eax
.round:
        ; t1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t]
        movdqa  xmm6, [data + rax*4]            ; W[t]
        paddd   xmm6, [r8 + rax*4]              ; + K[t]
        add     rax, 4
        paddd   xmm6, xmm10                     ; + h (before h is overwritten)

        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm9
        pandn   xmm1, xmm2                      ; ~e & g
        movdqa  xmm10, xmm2                     ; h = g
        movdqa  xmm2, xmm8
        movdqa  xmm9, xmm2                      ; g = f
        pand    xmm2, xmm0                      ; e & f
        pxor    xmm1, xmm2                      ; Ch = (e & f) ^ (~e & g)
        movdqa  xmm8, xmm0                      ; f = e
        paddd   xmm6, xmm1                      ; + Ch

        ; S1(e) = Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
        movdqa  xmm1, xmm0
        psrld   xmm0, 6                         ; e >> 6
        movdqa  xmm2, xmm0
        pslld   xmm1, 7                         ; e << 7
        psrld   xmm2, 5                         ; e >> 11
        pxor    xmm0, xmm1
        pxor    xmm0, xmm2
        pslld   xmm1, 14                        ; e << 21
        psrld   xmm2, 14                        ; e >> 25
        pxor    xmm0, xmm1
        pxor    xmm0, xmm2
        pslld   xmm1, 5                         ; e << 26
        pxor    xmm0, xmm1                      ; xmm0 = S1(e)
        paddd   xmm6, xmm0                      ; xmm6 = t1

        movdqa  xmm0, xmm3
        paddd   xmm0, xmm6                      ; e = d + t1

        ; Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c), interleaved with the
        ; d/c/b rotation of the state registers.
        movdqa  xmm1, xmm5                      ; b
        movdqa  xmm3, xmm4                      ; d = c
        movdqa  xmm2, xmm4                      ; c
        pand    xmm2, xmm5                      ; b & c
        pand    xmm4, xmm7                      ; a & c
        pand    xmm1, xmm7                      ; a & b
        pxor    xmm1, xmm4
        movdqa  xmm4, xmm5                      ; c = b
        movdqa  xmm5, xmm7                      ; b = a
        pxor    xmm1, xmm2                      ; xmm1 = Maj
        paddd   xmm6, xmm1                      ; t1 + Maj

        ; S0(a) = Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)
        movdqa  xmm2, xmm7
        psrld   xmm7, 2                         ; a >> 2
        movdqa  xmm1, xmm7
        pslld   xmm2, 10                        ; a << 10
        psrld   xmm1, 11                        ; a >> 13
        pxor    xmm7, xmm2
        pxor    xmm7, xmm1
        pslld   xmm2, 9                         ; a << 19
        psrld   xmm1, 9                         ; a >> 22
        pxor    xmm7, xmm2
        pxor    xmm7, xmm1
        pslld   xmm2, 11                        ; a << 30
        pxor    xmm7, xmm2                      ; xmm7 = S0(a)
        paddd   xmm7, xmm6                      ; a = t1 + Maj + S0(a)

        cmp     rax, ROUND_INDEX_END
        jb      .round

; ---- Feed-forward: add the starting state back into the working state.
        ADD_INIT_BCAST xmm7, 0                  ; a
        ADD_INIT_BCAST xmm5, 1                  ; b
        ADD_INIT_BCAST xmm4, 2                  ; c
        ADD_INIT_BCAST xmm3, 3                  ; d
        ADD_INIT_BCAST xmm0, 4                  ; e
        ADD_INIT_BCAST xmm8, 5                  ; f
        ADD_INIT_BCAST xmm9, 6                  ; g
        ADD_INIT_BCAST xmm10, 7                 ; h

; ---- Store the eight state vectors in a..h order.
        movdqa  [hash + 0*LANE_VEC_BYTES], xmm7
        movdqa  [hash + 1*LANE_VEC_BYTES], xmm5
        movdqa  [hash + 2*LANE_VEC_BYTES], xmm4
        movdqa  [hash + 3*LANE_VEC_BYTES], xmm3
        movdqa  [hash + 4*LANE_VEC_BYTES], xmm0
        movdqa  [hash + 5*LANE_VEC_BYTES], xmm8
        movdqa  [hash + 6*LANE_VEC_BYTES], xmm9
        movdqa  [hash + 7*LANE_VEC_BYTES], xmm10
        ret