Hash:   6abd3916
Author:
Date:   2016-11-15T08:47:43

Unified CMake-based build system

See #56 for discussion.

Fixes #21, Fixes #29, Fixes #37, Closes #56, Fixes #58, Closes #73
Obviates #82

See also:
https://sourceforge.net/p/libjpeg-turbo/feature-requests/5/
https://sourceforge.net/p/libjpeg-turbo/patches/5/
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based directly on jchuff.c; see jchuff.c for more
; details.
;
; [TAB8]
%include "jsimdext.inc"
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
global EXTN(jconst_huff_encode_one_block)
EXTN(jconst_huff_encode_one_block):
%include "jpeg_nbits_table.inc"
alignz 32
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64
; These macros perform the same task as the emit_bits() function in the
; original libjpeg code. In addition to reducing overhead by explicitly
; inlining the code, additional performance is achieved by taking into
; account the size of the bit buffer and waiting until it is almost full
; before emptying it. This mostly benefits 64-bit platforms, since 6
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
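;
; In C terms, EMIT_BITS(code, size) below behaves roughly like this
; sketch (illustrative only; the names follow the inline comments
; rather than the literal jchuff.c code):
;
;   if (put_bits > 47) {                      /* CHECKBUF47 */
;     for (i = 0; i < 6; i++) {               /* EMIT_BYTE x 6 */
;       put_bits -= 8;
;       c = (JOCTET)(put_buffer >> put_bits);
;       *buffer++ = c;
;       if (c == 0xFF) *buffer++ = 0;         /* stuff zero after 0xFF */
;     }
;   }
;   put_bits += size;                         /* PUT_BITS */
;   put_buffer = (put_buffer << size) | code;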
%macro EMIT_BYTE 0
sub put_bits, 8 ; put_bits -= 8;
mov rdx, put_buffer
mov ecx, put_bits
shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
mov byte [buffer], dl ; *buffer++ = c;
add buffer, 1
cmp dl, 0xFF ; need to stuff a zero byte?
jne %%.EMIT_BYTE_END
mov byte [buffer], 0 ; *buffer++ = 0;
add buffer, 1
%%.EMIT_BYTE_END:
%endmacro
%macro PUT_BITS 1
add put_bits, ecx ; put_bits += size;
shl put_buffer, cl ; put_buffer = (put_buffer << size);
or put_buffer, %1
%endmacro
%macro CHECKBUF31 0
cmp put_bits, 32 ; if (put_bits > 31) {
jl %%.CHECKBUF31_END
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
%%.CHECKBUF31_END:
%endmacro
%macro CHECKBUF47 0
cmp put_bits, 48 ; if (put_bits > 47) {
jl %%.CHECKBUF47_END
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
EMIT_BYTE
%%.CHECKBUF47_END:
%endmacro
%macro EMIT_BITS 2
CHECKBUF47
mov ecx, %2
PUT_BITS %1
%endmacro
%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
%if %1 != 32
pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
%else
pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
%endif
pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro
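; In C terms, one kloop_prepare invocation does roughly the following for
; its 32 coefficients (sketch; jno[] stands for the 32 zigzag-order indices
; passed as macro arguments; the last slot of the second invocation inserts
; 0 via ebx rather than a block value):
;
;   for (i = 0; i < 32; i++) {
;     int x = block[jno[i]];
;     t1[ko + i] = (x < 0) ? -x : x;      /* magnitude (nbits lookup)   */
;     t2[ko + i] = (x < 0) ? ~(-x) : x;   /* bits to emit (complemented */
;   }                                     /* if negative)               */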
;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET*)
; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
; JCOEFPTR block, int last_dc_val,
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; r10 = working_state *state
; r11 = JOCTET *buffer
; r12 = JCOEFPTR block
; r13d = int last_dc_val
; r14 = c_derived_tbl *dctbl
; r15 = c_derived_tbl *actbl
%define t1 rbp-(DCTSIZE2*SIZEOF_WORD)
%define t2 t1-(DCTSIZE2*SIZEOF_WORD)
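; t1 and t2 are two DCTSIZE2-word scratch arrays in the stack frame below
; the aligned rbp: t1 receives the coefficient magnitudes and t2 the
; corresponding bit patterns to emit (see kloop_prepare above).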
%define put_buffer r8
%define put_bits r9d
%define buffer rax
align 32
global EXTN(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
push rbp
mov rax, rsp ; rax = original rbp
sub rsp, byte 4
and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [t2]
push_xmm 4
collect_args 6
push rbx
mov buffer, r11 ; r11 is now scratch
mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
push r10 ; r10 is now scratch
; Encode the DC coefficient difference per section F.1.2.1
movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
sub edi, r13d ; r13 is not used anymore
mov ebx, edi
; This is a well-known technique for obtaining the absolute value
; without a branch. It is derived from an assembly language technique
; presented in "How to Optimize for the Pentium Processors",
; Copyright (c) 1996, 1997 by Agner Fog.
mov esi, edi
sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
xor edi, esi ; temp ^= temp3;
sub edi, esi ; temp -= temp3;
; For a negative input, we want temp2 = bitwise complement of abs(input).
; This code assumes we are on a two's complement machine.
add ebx, esi ; temp2 += temp3;
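; In C, the two tricks above amount to (sketch):
;   int mask = temp >> 31;         /* 0 if temp >= 0, -1 if temp < 0  */
;   temp = (temp ^ mask) - mask;   /* branch-free abs(temp)           */
;   temp2 += mask;                 /* temp - 1 for negatives; its low */
;                                  /* nbits bits equal ~abs(temp)     */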
; Find the number of bits needed for the magnitude of the coefficient
lea r11, [rel jpeg_nbits_table]
movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
; Emit the Huffman-coded symbol for the number of bits
mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
EMIT_BITS r11, esi ; EMIT_BITS(code, size)
; Mask off any extra bits in code
mov esi, 1
mov ecx, edi
shl esi, cl
dec esi
and ebx, esi ; temp2 &= (((JLONG) 1)<<nbits) - 1;
; Emit that number of bits of the value, if positive,
; or the complement of its magnitude, if negative.
EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
; Prepare data
xor ebx, ebx
kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
27, 20, 13, 6, 7, 14, 21, 28, 35, \
xmm0, xmm1, xmm2, xmm3
kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
53, 60, 61, 54, 47, 55, 62, 63, 63, \
xmm4, xmm5, xmm6, xmm7
pxor xmm8, xmm8
pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
shl r12, 16
shl r14, 16
or r11, r12
or r13, r14
shl r13, 32
or r11, r13
not r11 ; index = ~index;
;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
;jmp .EFN
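; r11 is now a 64-bit bitmap with bit k set iff t1[k] != 0, i.e. iff the
; k-th zigzag-order AC entry is nonzero; the loop below scans it with bsf
; (the asm counterpart of __builtin_ctzl) to skip whole runs of zeros.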
mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
lea rsi, [t1]
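; In C terms, the loop below is roughly (sketch of the branch-reduced
; encode_one_block() logic; k advances via rsi):
;
;   while (index != 0) {
;     r = __builtin_ctzl(index);  index >>= r;  k += r;
;     nbits = JPEG_NBITS(t1[k]);
;     while (r > 15) { EMIT_BITS(code_0xf0, size_0xf0);  r -= 16; }
;     temp3 = (r << 4) + nbits;
;     EMIT_BITS(actbl->ehufco[temp3], actbl->ehufsi[temp3]);
;     EMIT_BITS(t2[k] & ((1 << nbits) - 1), nbits);
;     index >>= 1;  k++;
;   }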
.BLOOP:
bsf r12, r11 ; r = __builtin_ctzl(index);
jz .ELOOP
mov rcx, r12
lea rsi, [rsi+r12*2] ; k += r;
shr r11, cl ; index >>= r;
movzx rdi, word [rsi] ; temp = t1[k];
lea rbx, [rel jpeg_nbits_table]
movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
.BRLOOP:
cmp r12, 16 ; while (r > 15) {
jl .ERLOOP
EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
sub r12, 16 ; r -= 16;
jmp .BRLOOP
.ERLOOP:
; Emit Huffman symbol for run length / number of bits
CHECKBUF31 ; uses rcx, rdx
shl r12, 4 ; temp3 = (r << 4) + nbits;
add r12, rdi
mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
PUT_BITS rbx
;EMIT_CODE(code, size)
movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
mov rcx, rdi
mov rdx, 1
shl rdx, cl
dec rdx
and rbx, rdx ; temp2 &= (((JLONG) 1)<<nbits) - 1;
PUT_BITS rbx ; PUT_BITS(temp2, nbits)
shr r11, 1 ; index >>= 1;
add rsi, 2 ; ++k;
jmp .BLOOP
.ELOOP:
; If the last coef(s) were zero, emit an end-of-block code
lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
cmp rdi, rsi ; if (r > 0) {
je .EFN
mov ebx, INT [r15] ; code = actbl->ehufco[0];
movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
EMIT_BITS rbx, r12d
.EFN:
pop r10
; Save put_buffer & put_bits
mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
pop rbx
uncollect_args 6
pop_xmm 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32