Commit dd5bd03075149d7cf8441875c1a344e8beb57dde

Martin Storsjö 2021-04-07T05:42:10

Fix building for arm windows with mingw toolchains (#631)

* arm: Check _WIN32 instead of _M_ARM or _MSC_VER for detecting windows

  This matches what was done for ARM64 in c06468fa6674d3783a0edb1d0fae9afc8bc28513.

* arm: Only use armasm source when building with MSVC

  When building for windows/arm with clang, the normal gas style .S source works fine (if fixed up to support thumb and other windows specifics).

  This matches what was done for ARM64 in c06468fa6674d3783a0edb1d0fae9afc8bc28513.

* arm: Fix sysv.S to work in thumb mode

  Align cases in jump tables (adding nop padding to make sure each case starts where expected).

  Rewrite instructions that add directly to the pc register.

  For ffi_closure_ret, factor out a call_epilogue subroutine that restores both sp and pc from the stack; the thumb version of ldm can't load into the sp register. To avoid excessive ifdeffing, keep using call_epilogue in arm mode, but keep the shorter "ldm sp, {sp, pc}" epilogue in that case.

* arm: Add win32 version of trampoline to sysv.S

  This matches the version of it in sysv_msvc_arm32.S. The calling C code expects a specific form of the trampoline on windows; make sure these work the same on windows regardless of the form of assembly used.

* arm: Avoid optimizing out clearing the thumb bit of ffi_arm_trampoline

  We clear the thumb bit of ffi_arm_trampoline with a bitmask before memcpying its instructions into closure->tramp. If the bit isn't cleared, the memcpy of the trampoline function copies the wrong instructions.

  If the ffi_arm_trampoline symbol is declared as an array of int, the compiler can assume that it is aligned to a 4 byte boundary and the bitmask operation is a no-op, and optimize it out.

  See https://godbolt.org/z/dE3jE1WTz; both Clang and GCC optimize out the bitmask as it is, while MSVC doesn't.

  By declaring the trampoline as an array of unsigned char, the bitmask works as intended.

diff --git a/configure.host b/configure.host
index 257b784..2682671 100644
--- a/configure.host
+++ b/configure.host
@@ -32,7 +32,9 @@ case "${host}" in
 
   arm*-*-cygwin* | arm*-*-mingw* | arm*-*-win* )
 	TARGET=ARM_WIN32; TARGETDIR=arm
-	MSVC=1
+	if test "${ax_cv_c_compiler_vendor}" = "microsoft"; then
+	  MSVC=1
+	fi
 	;;
 
   arm*-*-*)
@@ -264,7 +266,11 @@ esac
 # ... but some of the cases above share configury.
 case "${TARGET}" in
   ARM_WIN32)
-	SOURCES="ffi.c sysv_msvc_arm32.S"
+	if test "$MSVC" = 1; then
+		SOURCES="ffi.c sysv_msvc_arm32.S"
+	else
+		SOURCES="ffi.c sysv.S"
+	fi
 	;;
   ARM_WIN64)
 	if test "$MSVC" = 1; then
diff --git a/src/arm/ffi.c b/src/arm/ffi.c
index b2f60d1..593ab4d 100644
--- a/src/arm/ffi.c
+++ b/src/arm/ffi.c
@@ -37,7 +37,7 @@
 #include <tramp.h>
 #include "internal.h"
 
-#if defined(_MSC_VER) && defined(_M_ARM)
+#if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #endif
@@ -49,10 +49,13 @@
 #endif
 
 #else
-#ifndef _M_ARM
+#ifndef _WIN32
 extern unsigned int ffi_arm_trampoline[2] FFI_HIDDEN;
 #else
-extern unsigned int ffi_arm_trampoline[3] FFI_HIDDEN;
+// Declare this as an array of char, instead of array of int,
+// otherwise Clang optimizes out the "& 0xFFFFFFFE" for clearing
+// the thumb bit.
+extern unsigned char ffi_arm_trampoline[12] FFI_HIDDEN;
 #endif
 #endif
 
@@ -104,13 +107,13 @@ ffi_put_arg (ffi_type *ty, void *src, void *dst)
     case FFI_TYPE_SINT32:
     case FFI_TYPE_UINT32:
     case FFI_TYPE_POINTER:
-#ifndef _MSC_VER
+#ifndef _WIN32
     case FFI_TYPE_FLOAT:
 #endif
       *(UINT32 *)dst = *(UINT32 *)src;
       break;
 
-#ifdef _MSC_VER
+#ifdef _WIN32
     // casting a float* to a UINT32* doesn't work on Windows
     case FFI_TYPE_FLOAT:
         *(uintptr_t *)dst = 0;
@@ -633,7 +636,7 @@ ffi_prep_closure_loc (ffi_closure * closure,
 #endif
 
   /* Initialize the dynamic trampoline. */
-#ifndef _M_ARM
+#ifndef _WIN32
   memcpy(closure->tramp, ffi_arm_trampoline, 8);
 #else
   // cast away function type so MSVC doesn't set the lower bit of the function pointer
@@ -643,13 +646,13 @@ ffi_prep_closure_loc (ffi_closure * closure,
 #if defined (__QNX__)
   msync(closure->tramp, 8, 0x1000000);	/* clear data map */
   msync(codeloc, 8, 0x1000000);	/* clear insn map */
-#elif defined(_MSC_VER)
+#elif defined(_WIN32)
   FlushInstructionCache(GetCurrentProcess(), closure->tramp, FFI_TRAMPOLINE_SIZE);
 #else
   __clear_cache(closure->tramp, closure->tramp + 8);	/* clear data map */
   __clear_cache(codeloc, codeloc + 8);			/* clear insn map */
 #endif
-#ifdef _M_ARM
+#ifdef _WIN32
   *(void(**)(void))(closure->tramp + FFI_TRAMPOLINE_CLOSURE_FUNCTION) = closure_func;
 #else
   *(void (**)(void))(closure->tramp + 8) = closure_func;
diff --git a/src/arm/ffitarget.h b/src/arm/ffitarget.h
index cb57b84..12d5d20 100644
--- a/src/arm/ffitarget.h
+++ b/src/arm/ffitarget.h
@@ -43,7 +43,7 @@ typedef enum ffi_abi {
   FFI_SYSV,
   FFI_VFP,
   FFI_LAST_ABI,
-#if defined(__ARM_PCS_VFP) || defined(_M_ARM)
+#if defined(__ARM_PCS_VFP) || defined(_WIN32)
   FFI_DEFAULT_ABI = FFI_VFP,
 #else
   FFI_DEFAULT_ABI = FFI_SYSV,
@@ -57,7 +57,7 @@ typedef enum ffi_abi {
   signed char vfp_args[16]			\
 
 #define FFI_TARGET_SPECIFIC_VARIADIC
-#ifndef _M_ARM
+#ifndef _WIN32
 #define FFI_TARGET_HAS_COMPLEX_TYPE
 #endif
 
@@ -77,7 +77,7 @@ typedef enum ffi_abi {
 #endif
 
 #else
-#ifdef _MSC_VER
+#ifdef _WIN32
 #define FFI_TRAMPOLINE_SIZE 16
 #define FFI_TRAMPOLINE_CLOSURE_FUNCTION 12
 #else
diff --git a/src/arm/sysv.S b/src/arm/sysv.S
index e816e32..fb36213 100644
--- a/src/arm/sysv.S
+++ b/src/arm/sysv.S
@@ -92,9 +92,25 @@
 #define ARM_FUNC_END(name) \
 	SIZE(name)
 
+	.text
+	.syntax unified
+#if defined(_WIN32)
+	/* Windows on ARM is thumb-only */
+	.thumb
+#else
+	/* Keep the assembly in ARM mode in other cases, for simplicity
+	 * (to avoid interworking issues). */
+#undef __thumb__
+	.arm
+#endif
+
 /* Aid in defining a jump table with 8 bytes between entries.  */
+#ifdef __thumb__
+/* In thumb mode, instructions can be shorter than expected in arm mode, so
+ * we need to align the start of each case. */
+# define E(index) .align 3
+#elif defined(__clang__)
 /* ??? The clang assembler doesn't handle .if with symbolic expressions.  */
-#ifdef __clang__
 # define E(index)
 #else
 # define E(index)				\
@@ -103,9 +119,6 @@
 	.endif
 #endif
 
-	.text
-	.syntax unified
-	.arm
 
 #ifndef __clang__
 	/* We require interworking on LDM, which implies ARMv5T,
@@ -128,6 +141,7 @@ ARM_FUNC_START(ffi_call_VFP)
 	cfi_startproc
 
 	cmp	r3, #3			@ load only d0 if possible
+	ite	le
 #ifdef __clang__
 	vldrle d0, [r0]
 	vldmgt r0, {d0-d7}
@@ -167,9 +181,16 @@ ARM_FUNC_START(ffi_call_SYSV)
 	cfi_def_cfa_register(sp)
 
 	@ Store values stored in registers.
+#ifndef __thumb__
 	.align	3
 	add	pc, pc, r3, lsl #3
 	nop
+#else
+	adr	ip, 0f
+	add	ip, ip, r3, lsl #3
+	mov	pc, ip
+	.align	3
+#endif
 0:
 E(ARM_TYPE_VFP_S)
 #ifdef __clang__
@@ -228,6 +249,9 @@ ARM_FUNC_END(ffi_go_closure_SYSV)
 ARM_FUNC_START(ffi_closure_SYSV)
 	UNWIND(.fnstart)
 	cfi_startproc
+#ifdef _WIN32
+	ldmfd	sp!, {r0, ip}			@ restore fp (r0 is used for stack alignment)
+#endif
 	stmdb	sp!, {r0-r3}			@ save argument regs
 	cfi_adjust_cfa_offset(16)
 
@@ -256,7 +280,12 @@ ARM_FUNC_START(ffi_closure_SYSV)
 	@ Load values returned in registers.
 	add	r2, sp, #8+64			@ load result
 	adr	r3, CNAME(ffi_closure_ret)
+#ifndef __thumb__
 	add	pc, r3, r0, lsl #3
+#else
+	add	r3, r3, r0, lsl #3
+	mov	pc, r3
+#endif
 	cfi_endproc
 	UNWIND(.fnend)
 ARM_FUNC_END(ffi_closure_SYSV)
@@ -275,6 +304,9 @@ ARM_FUNC_END(ffi_go_closure_VFP)
 ARM_FUNC_START(ffi_closure_VFP)
 	UNWIND(.fnstart)
 	cfi_startproc
+#ifdef _WIN32
+	ldmfd	sp!, {r0, ip}			@ restore fp (r0 is used for stack alignment)
+#endif
 	stmdb	sp!, {r0-r3}			@ save argument regs
 	cfi_adjust_cfa_offset(16)
 
@@ -306,7 +338,12 @@ ARM_FUNC_START(ffi_closure_VFP)
 	@ Load values returned in registers.
 	add	r2, sp, #8+64			@ load result
 	adr	r3, CNAME(ffi_closure_ret)
+#ifndef __thumb__
 	add	pc, r3, r0, lsl #3
+#else
+	add	r3, r3, r0, lsl #3
+	mov	pc, r3
+#endif
 	cfi_endproc
 	UNWIND(.fnend)
 ARM_FUNC_END(ffi_closure_VFP)
@@ -326,32 +363,40 @@ E(ARM_TYPE_VFP_S)
 #else
 	ldc	p10, cr0, [r2]			@ vldr s0, [r2]
 #endif
-	ldm	sp, {sp,pc}
+	b	call_epilogue
 E(ARM_TYPE_VFP_D)
 #ifdef __clang__
 	vldr d0, [r2]
 #else
 	ldc	p11, cr0, [r2]			@ vldr d0, [r2]
 #endif
-	ldm	sp, {sp,pc}
+	b	call_epilogue
 E(ARM_TYPE_VFP_N)
 #ifdef __clang__
 	vldm r2, {d0-d3}
 #else
 	ldc	p11, cr0, [r2], {8}		@ vldm r2, {d0-d3}
 #endif
-	ldm	sp, {sp,pc}
+	b	call_epilogue
 E(ARM_TYPE_INT64)
 	ldr	r1, [r2, #4]
 	nop
 E(ARM_TYPE_INT)
 	ldr	r0, [r2]
-	ldm	sp, {sp,pc}
+	b	call_epilogue
 E(ARM_TYPE_VOID)
-	ldm	sp, {sp,pc}
+	b	call_epilogue
 	nop
 E(ARM_TYPE_STRUCT)
+	b	call_epilogue
+call_epilogue:
+#ifndef __thumb__
 	ldm	sp, {sp,pc}
+#else
+	ldm	sp, {ip,lr}
+	mov	sp, ip
+	bx	lr
+#endif
 	cfi_endproc
 ARM_FUNC_END(ffi_closure_ret)
 
@@ -419,6 +464,15 @@ ARM_FUNC_START(ffi_closure_trampoline_table_page)
 ARM_FUNC_END(ffi_closure_trampoline_table_page)
 #endif
 
+#elif defined(_WIN32)
+
+ARM_FUNC_START(ffi_arm_trampoline)
+0:	adr	ip, 0b				/* ip = address of label 0, i.e. the start of this trampoline */
+	stmdb	sp!, {r0, ip}			/* push r0 and ip; the _WIN32 entry of ffi_closure_SYSV/VFP pops this pair again */
+	ldr	pc, 1f				/* jump to the address held in the word at label 1 */
+1:	.long	0				/* jump-target slot, zero here; NOTE(review): presumably filled with closure_func by ffi_prep_closure_loc after the copy - confirm */
+ARM_FUNC_END(ffi_arm_trampoline)
+
 #else
 
 ARM_FUNC_START(ffi_arm_trampoline)