Commit b28908860d2001f1c66627e0ec024a01e5e9af7c

David Turner 2013-07-16T12:52:18

Optimize FT_MulFix for x86_64 GCC builds. This patch provides an optimized `FT_MulFix' implementation for x86_64 machines when FreeType is built with GCC, or compatible compilers like Clang. Example: bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf Before: Load 4.863 us/op Load_Advances (Normal) 4.816 us/op Load_Advances (Fast) 0.028 us/op Render 2.753 us/op Get_Glyph 0.463 us/op Get_CBox 0.077 us/op Get_Char_Index 0.023 us/op Iterate CMap 13.898 us/op New_Face 12.368 us/op Embolden 0.028 us/op Get_BBox 0.302 us/op After: Load 4.617 us/op Load_Advances (Normal) 4.645 us/op Load_Advances (Fast) 0.027 us/op Render 2.789 us/op Get_Glyph 0.460 us/op Get_CBox 0.077 us/op Get_Char_Index 0.024 us/op Iterate CMap 13.403 us/op New_Face 12.278 us/op Embolden 0.028 us/op Get_BBox 0.301 us/op * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h (FT_MulFix_x86_64): New function.

diff --git a/ChangeLog b/ChangeLog
index 41c0d0d..738a1ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,47 @@
 2013-07-16  David Turner  <digit@google.com>
 
+	Optimize FT_MulFix for x86_64 GCC builds.
+
+	This patch provides an optimized `FT_MulFix' implementation for
+	x86_64 machines when FreeType is built with GCC, or compatible
+	compilers like Clang.
+
+	Example:
+	  bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf
+
+	Before:
+
+	  Load                       4.863 us/op
+	  Load_Advances (Normal)     4.816 us/op
+	  Load_Advances (Fast)       0.028 us/op
+	  Render                     2.753 us/op
+	  Get_Glyph                  0.463 us/op
+	  Get_CBox                   0.077 us/op
+	  Get_Char_Index             0.023 us/op
+	  Iterate CMap              13.898 us/op
+	  New_Face                  12.368 us/op
+	  Embolden                   0.028 us/op
+	  Get_BBox                   0.302 us/op
+
+	After:
+
+	  Load                       4.617 us/op
+	  Load_Advances (Normal)     4.645 us/op
+	  Load_Advances (Fast)       0.027 us/op
+	  Render                     2.789 us/op
+	  Get_Glyph                  0.460 us/op
+	  Get_CBox                   0.077 us/op
+	  Get_Char_Index             0.024 us/op
+	  Iterate CMap              13.403 us/op
+	  New_Face                  12.278 us/op
+	  Embolden                   0.028 us/op
+	  Get_BBox                   0.301 us/op
+
+	* builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
+	(FT_MulFix_x86_64): New function.
+
+2013-07-16  David Turner  <digit@google.com>
+
 	Speed up ARMv7 support.
 
 	When building for ARMv7 with thumb2 instructions, the optimized
diff --git a/builds/unix/ftconfig.in b/builds/unix/ftconfig.in
index c82fe5d..c373b9f 100644
--- a/builds/unix/ftconfig.in
+++ b/builds/unix/ftconfig.in
@@ -366,6 +366,7 @@ FT_BEGIN_HEADER
   /* These must be defined `static __inline__' with GCC.             */
 
 #if defined( __CC_ARM ) || defined( __ARMCC__ )  /* RVCT */
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_arm
 
   /* documentation is in freetype.h */
@@ -428,7 +429,9 @@ FT_BEGIN_HEADER
        /* ( __thumb2__ || !__thumb__ ) && */
        /* !( __CC_ARM || __ARMCC__ )      */
 
+
 #if defined( __i386__ )
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_i386
 
   /* documentation is in freetype.h */
@@ -497,6 +500,62 @@ FT_BEGIN_HEADER
 
 #endif /* _MSC_VER */
 
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER  FT_MulFix_x86_64
+
+  static __inline__ FT_Int32
+  FT_MulFix_x86_64( FT_Int32  a,
+                    FT_Int32  b )
+  {
+    /* Temporarily disable the warning that C90 doesn't support */
+    /* `long long'.                                             */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+    /* Plain C, not assembly; GCC inlines it into good machine code. */
+    /* The 64-bit product is rounded to nearest (halves away from    */
+    /* zero) before the low 16 bits are shifted out.                 */
+    long long  ret, tmp;
+
+
+    ret  = (long long)a * b;
+    tmp  = ret >> 63;
+    ret += 0x8000 + tmp;
+
+    return (FT_Int32)( ret >> 16 );
+#else
+
+    /* Disabled assembly version of the same computation.  `imul' both   */
+    /* reads and writes `wide_a', so it must be a read-write operand     */
+    /* (`+r').  Declaring it write-only (`=&r') let GCC skip loading the */
+    /* sign-extended value of `a' into the register, which is the wrong  */
+    /* code that was seen with GCC 4.6 on Ubuntu 12.04.                  */
+    long long  wide_a = (long long)a;
+    long long  wide_b = (long long)b;
+    long long  result;
+
+
+    __asm__ __volatile__ (
+      "imul %2, %1\n"
+      "mov %1, %0\n"
+      "sar $63, %0\n"
+      "lea 0x8000(%1, %0), %0\n"
+      "sar $16, %0\n"
+      : "=&r"(result), "+r"(wide_a)
+      : "r"(wide_b)
+      : "cc" );
+
+    return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+  }
+
+#endif /* __GNUC__ && __x86_64__ */
+
 #endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */
 
 
diff --git a/include/freetype/config/ftconfig.h b/include/freetype/config/ftconfig.h
index 3349e29..ab1e7a5 100644
--- a/include/freetype/config/ftconfig.h
+++ b/include/freetype/config/ftconfig.h
@@ -338,6 +338,7 @@ FT_BEGIN_HEADER
   /* These must be defined `static __inline__' with GCC.             */
 
 #if defined( __CC_ARM ) || defined( __ARMCC__ )  /* RVCT */
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_arm
 
   /* documentation is in freetype.h */
@@ -370,6 +371,7 @@ FT_BEGIN_HEADER
 #if defined( __arm__ )                                 && \
     ( !defined( __thumb__ ) || defined( __thumb2__ ) ) && \
     !( defined( __CC_ARM ) || defined( __ARMCC__ ) )
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_arm
 
   /* documentation is in freetype.h */
@@ -399,7 +401,9 @@ FT_BEGIN_HEADER
        /* ( __thumb2__ || !__thumb__ ) && */
        /* !( __CC_ARM || __ARMCC__ )      */
 
+
 #if defined( __i386__ )
+
 #define FT_MULFIX_ASSEMBLER  FT_MulFix_i386
 
   /* documentation is in freetype.h */
@@ -468,6 +472,62 @@ FT_BEGIN_HEADER
 
 #endif /* _MSC_VER */
 
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER  FT_MulFix_x86_64
+
+  static __inline__ FT_Int32
+  FT_MulFix_x86_64( FT_Int32  a,
+                    FT_Int32  b )
+  {
+    /* Temporarily disable the warning that C90 doesn't support */
+    /* `long long'.                                             */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+    /* Plain C, not assembly; GCC inlines it into good machine code. */
+    /* The 64-bit product is rounded to nearest (halves away from    */
+    /* zero) before the low 16 bits are shifted out.                 */
+    long long  ret, tmp;
+
+
+    ret  = (long long)a * b;
+    tmp  = ret >> 63;
+    ret += 0x8000 + tmp;
+
+    return (FT_Int32)( ret >> 16 );
+#else
+
+    /* Disabled assembly version of the same computation.  `imul' both   */
+    /* reads and writes `wide_a', so it must be a read-write operand     */
+    /* (`+r').  Declaring it write-only (`=&r') let GCC skip loading the */
+    /* sign-extended value of `a' into the register, which is the wrong  */
+    /* code that was seen with GCC 4.6 on Ubuntu 12.04.                  */
+    long long  wide_a = (long long)a;
+    long long  wide_b = (long long)b;
+    long long  result;
+
+
+    __asm__ __volatile__ (
+      "imul %2, %1\n"
+      "mov %1, %0\n"
+      "sar $63, %0\n"
+      "lea 0x8000(%1, %0), %0\n"
+      "sar $16, %0\n"
+      : "=&r"(result), "+r"(wide_a)
+      : "r"(wide_b)
+      : "cc" );
+
+    return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+  }
+
+#endif /* __GNUC__ && __x86_64__ */
+
 #endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */