Optimize FT_MulFix for x86_64 GCC builds. This patch provides an optimized `FT_MulFix' implementation for x86_64 machines when FreeType is built with GCC, or compatible compilers like Clang. Example: bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf Before: Load 4.863 us/op Load_Advances (Normal) 4.816 us/op Load_Advances (Fast) 0.028 us/op Render 2.753 us/op Get_Glyph 0.463 us/op Get_CBox 0.077 us/op Get_Char_Index 0.023 us/op Iterate CMap 13.898 us/op New_Face 12.368 us/op Embolden 0.028 us/op Get_BBox 0.302 us/op After: Load 4.617 us/op Load_Advances (Normal) 4.645 us/op Load_Advances (Fast) 0.027 us/op Render 2.789 us/op Get_Glyph 0.460 us/op Get_CBox 0.077 us/op Get_Char_Index 0.024 us/op Iterate CMap 13.403 us/op New_Face 12.278 us/op Embolden 0.028 us/op Get_BBox 0.301 us/op * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h (FT_MulFix_x86_64): New function.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
diff --git a/ChangeLog b/ChangeLog
index 41c0d0d..738a1ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,47 @@
2013-07-16 David Turner <digit@google.com>
+ Optimize FT_MulFix for x86_64 GCC builds.
+
+ This patch provides an optimized `FT_MulFix' implementation for
+ x86_64 machines when FreeType is built with GCC, or compatible
+ compilers like Clang.
+
+ Example:
+ bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf
+
+ Before:
+
+ Load 4.863 us/op
+ Load_Advances (Normal) 4.816 us/op
+ Load_Advances (Fast) 0.028 us/op
+ Render 2.753 us/op
+ Get_Glyph 0.463 us/op
+ Get_CBox 0.077 us/op
+ Get_Char_Index 0.023 us/op
+ Iterate CMap 13.898 us/op
+ New_Face 12.368 us/op
+ Embolden 0.028 us/op
+ Get_BBox 0.302 us/op
+
+ After:
+
+ Load 4.617 us/op
+ Load_Advances (Normal) 4.645 us/op
+ Load_Advances (Fast) 0.027 us/op
+ Render 2.789 us/op
+ Get_Glyph 0.460 us/op
+ Get_CBox 0.077 us/op
+ Get_Char_Index 0.024 us/op
+ Iterate CMap 13.403 us/op
+ New_Face 12.278 us/op
+ Embolden 0.028 us/op
+ Get_BBox 0.301 us/op
+
+ * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
+ (FT_MulFix_x86_64): New function.
+
+2013-07-16 David Turner <digit@google.com>
+
Speed up ARMv7 support.
When building for ARMv7 with thumb2 instructions, the optimized
diff --git a/builds/unix/ftconfig.in b/builds/unix/ftconfig.in
index c82fe5d..c373b9f 100644
--- a/builds/unix/ftconfig.in
+++ b/builds/unix/ftconfig.in
@@ -366,6 +366,7 @@ FT_BEGIN_HEADER
/* These must be defined `static __inline__' with GCC. */
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@@ -428,7 +429,9 @@ FT_BEGIN_HEADER
/* ( __thumb2__ || !__thumb__ ) && */
/* !( __CC_ARM || __ARMCC__ ) */
+
#if defined( __i386__ )
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
/* documentation is in freetype.h */
@@ -497,6 +500,62 @@ FT_BEGIN_HEADER
#endif /* _MSC_VER */
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
+
+ static __inline__ FT_Int32
+ FT_MulFix_x86_64( FT_Int32 a,
+ FT_Int32 b )
+ {
+ /* Temporarily disable the warning that C90 doesn't support */
+ /* `long long'. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+ /* Technically not an assembly fragment, but GCC does a really good */
+ /* job at inlining it and generating good machine code for it. */
+ long long ret, tmp;
+
+
+ ret = (long long)a * b;
+ tmp = ret >> 63;
+ ret += 0x8000 + tmp;
+
+ return (FT_Int32)( ret >> 16 );
+#else
+
+ /* For some reason, GCC 4.6 on Ubuntu 12.04 generates invalid machine */
+ /* code from the lines below. The main issue is that `wide_a' is not */
+ /* properly initialized by sign-extending `a'. Instead, the generated */
+ /* machine code assumes that the register that contains `a' on input */
+ /* can be used directly as a 64-bit value, which is wrong most of the */
+ /* time. */
+ long long wide_a = (long long)a;
+ long long wide_b = (long long)b;
+ long long result;
+
+
+ __asm__ __volatile__ (
+ "imul %2, %1\n"
+ "mov %1, %0\n"
+ "sar $63, %0\n"
+ "lea 0x8000(%1, %0), %0\n"
+ "sar $16, %0\n"
+ : "=&r"(result), "=&r"(wide_a)
+ : "r"(wide_b)
+ : "cc" );
+
+ return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+ }
+
+#endif /* __GNUC__ && __x86_64__ */
+
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */
diff --git a/include/freetype/config/ftconfig.h b/include/freetype/config/ftconfig.h
index 3349e29..ab1e7a5 100644
--- a/include/freetype/config/ftconfig.h
+++ b/include/freetype/config/ftconfig.h
@@ -338,6 +338,7 @@ FT_BEGIN_HEADER
/* These must be defined `static __inline__' with GCC. */
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@@ -370,6 +371,7 @@ FT_BEGIN_HEADER
#if defined( __arm__ ) && \
( !defined( __thumb__ ) || defined( __thumb2__ ) ) && \
!( defined( __CC_ARM ) || defined( __ARMCC__ ) )
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@@ -399,7 +401,9 @@ FT_BEGIN_HEADER
/* ( __thumb2__ || !__thumb__ ) && */
/* !( __CC_ARM || __ARMCC__ ) */
+
#if defined( __i386__ )
+
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
/* documentation is in freetype.h */
@@ -468,6 +472,62 @@ FT_BEGIN_HEADER
#endif /* _MSC_VER */
+
+#if defined( __GNUC__ ) && defined( __x86_64__ )
+
+#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
+
+ static __inline__ FT_Int32
+ FT_MulFix_x86_64( FT_Int32 a,
+ FT_Int32 b )
+ {
+ /* Temporarily disable the warning that C90 doesn't support */
+ /* `long long'. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+#if 1
+ /* Technically not an assembly fragment, but GCC does a really good */
+ /* job at inlining it and generating good machine code for it. */
+ long long ret, tmp;
+
+
+ ret = (long long)a * b;
+ tmp = ret >> 63;
+ ret += 0x8000 + tmp;
+
+ return (FT_Int32)( ret >> 16 );
+#else
+
+ /* For some reason, GCC 4.6 on Ubuntu 12.04 generates invalid machine */
+ /* code from the lines below. The main issue is that `wide_a' is not */
+ /* properly initialized by sign-extending `a'. Instead, the generated */
+ /* machine code assumes that the register that contains `a' on input */
+ /* can be used directly as a 64-bit value, which is wrong most of the */
+ /* time. */
+ long long wide_a = (long long)a;
+ long long wide_b = (long long)b;
+ long long result;
+
+
+ __asm__ __volatile__ (
+ "imul %2, %1\n"
+ "mov %1, %0\n"
+ "sar $63, %0\n"
+ "lea 0x8000(%1, %0), %0\n"
+ "sar $16, %0\n"
+ : "=&r"(result), "=&r"(wide_a)
+ : "r"(wide_b)
+ : "cc" );
+
+ return (FT_Int32)result;
+#endif
+
+#pragma GCC diagnostic pop
+ }
+
+#endif /* __GNUC__ && __x86_64__ */
+
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */