[truetype] Add assembler code for TT_MulFix14 and TT_DotFix14.

This patch provides slightly optimized versions for ARM, x86, and
x86_64 CPUs if built with GCC.

Also remove some dead code.

* src/truetype/ttinterp.c (TT_MulFix14_arm, TT_MulFix14_long_long,
TT_DotFix14_long_long): New functions.
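For reference, the helpers touched here compute rounded 2.14 fixed-point
products, i.e. (a*b)/2^14 and (ax*bx + ay*by)/2^14, by adding a rounding
bias before the final shift.  The sketch below is illustrative only (the
names `mulfix14_ref' and `dotfix14_ref' and the <stdint.h> types are not
FreeType API) and assumes an arithmetic right shift of negative values,
the same assumption the patch itself documents for its `long long' path.

  #include <stdint.h>

  /* Illustrative only: the rounded 2.14 multiply that TT_MulFix14     */
  /* implements, written with a 64-bit intermediate.                   */
  static int32_t
  mulfix14_ref( int32_t  a,
                int32_t  b )
  {
    int64_t  v = (int64_t)a * b;        /* exact 64-bit product        */


    /* rounding bias: 0x2000 for non-negative products, 0x1FFF for     */
    /* negative ones (`v >> 63' is 0 or -1 with an arithmetic shift)   */
    v += 0x2000 + ( v >> 63 );

    return (int32_t)( v >> 14 );        /* drop the 14 fraction bits   */
  }

  /* Likewise for the rounded dot product (ax*bx + ay*by) / 2^14       */
  /* that TT_DotFix14 implements.                                      */
  static int32_t
  dotfix14_ref( int32_t  ax,
                int32_t  ay,
                int32_t  bx,
                int32_t  by )
  {
    int64_t  v = (int64_t)ax * bx + (int64_t)ay * by;


    v += 0x2000 + ( v >> 63 );

    return (int32_t)( v >> 14 );
  }

The patch's x86/x86_64 variants follow this arithmetic directly, while the
ARM variant reproduces it with `smull' followed by a 64-bit add and shifts.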
diff --git a/ChangeLog b/ChangeLog
index 738a1ad..807a81c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,17 @@
2013-07-16 David Turner <digit@google.com>
+ [truetype] Add assembler code for TT_MulFix14 and TT_DotFix14.
+
+ This patch provides slightly optimized versions for ARM, x86, and
+ x86_64 CPUs if built with GCC.
+
+ Also remove some dead code.
+
+ * src/truetype/ttinterp.c (TT_MulFix14_arm, TT_MulFix14_long_long,
+ TT_DotFix14_long_long): New functions.
+
+2013-07-16 David Turner <digit@google.com>
+
Optimize FT_MulFix for x86_64 GCC builds.
This patch provides an optimized `FT_MulFix' implementation for
diff --git a/src/truetype/ttinterp.c b/src/truetype/ttinterp.c
index e7ffb98..5ed16d0 100644
--- a/src/truetype/ttinterp.c
+++ b/src/truetype/ttinterp.c
@@ -1437,9 +1437,100 @@
#undef PACK
-#if 1
+
+#ifndef FT_CONFIG_OPTION_NO_ASSEMBLER
+
+#if defined( __arm__ ) && \
+ ( defined( __thumb2__ ) || !defined( __thumb__ ) )
+
+#define TT_MulFix14 TT_MulFix14_arm
static FT_Int32
+ TT_MulFix14_arm( FT_Int32 a,
+ FT_Int b )
+ {
+ register FT_Int32 t, t2;
+
+
+#if defined( __CC_ARM ) || defined( __ARMCC__ )
+
+ __asm
+ {
+ smull t2, t, b, a /* (lo=t2,hi=t) = a*b */
+ mov a, t, asr #31 /* a = (hi >> 31) */
+ add a, a, #0x2000 /* a += 0x2000 */
+ adds t2, t2, a /* t2 += a */
+ adc t, t, #0 /* t += carry */
+ mov a, t2, lsr #14 /* a = t2 >> 14 */
+ orr a, a, t, lsl #18 /* a |= t << 18 */
+ }
+
+#elif defined( __GNUC__ )
+
+ __asm__ __volatile__ (
+ "smull %1, %2, %4, %3\n\t" /* (lo=%1,hi=%2) = a*b */
+ "mov %0, %2, asr #31\n\t" /* %0 = (hi >> 31) */
+ "add %0, %0, #0x2000\n\t" /* %0 += 0x2000 */
+ "adds %1, %1, %0\n\t" /* %1 += %0 */
+ "adc %2, %2, #0\n\t" /* %2 += carry */
+ "mov %0, %1, lsr #14\n\t" /* %0 = %1 >> 16 */
+ "orr %0, %0, %2, lsl #18\n\t" /* %0 |= %2 << 16 */
+ : "=r"(a), "=&r"(t2), "=&r"(t)
+ : "r"(a), "r"(b)
+ : "cc" );
+
+#endif
+
+ return a;
+ }
+
+#endif /* __arm__ && ( __thumb2__ || !__thumb__ ) */
+
+#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */
+
+
+#if defined( __GNUC__ ) && \
+ ( defined( __i386__ ) || defined( __x86_64__ ) )
+
+#define TT_MulFix14 TT_MulFix14_long_long
+
+ /* This is declared `noinline' because inlining the function results */
+ /* in slower code. The `pure' attribute indicates that the result */
+ /* only depends on the parameters. */
+ static __attribute__(( noinline ))
+ __attribute__(( pure )) FT_Int32
+ TT_MulFix14_long_long( FT_Int32 a,
+ FT_Int b )
+ {
+ /* Temporarily disable the warning that C90 doesn't support */
+ /* `long long'. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
+
+ long long ret = (long long)a * b;
+
+ /* The following line assumes that right shifting of signed values */
+ /* will actually preserve the sign bit. The exact behaviour is */
+ /* implementation-defined, but this is true on x86 and x86_64. */
+ long long tmp = ret >> 63;
+
+
+ ret += 0x2000 + tmp;
+
+ return (FT_Int32)( ret >> 14 );
+
+#pragma GCC diagnostic pop
+ }
+
+#endif /* __GNUC__ && ( __i386__ || __x86_64__ ) */
+
+
+#ifndef TT_MulFix14
+
+ /* Compute (a*b)/2^14 with maximum accuracy and rounding. */
+ /* This is optimized to be faster than calling FT_MulFix() */
+ /* for platforms where sizeof(int) == 2. */
+ static FT_Int32
TT_MulFix14( FT_Int32 a,
FT_Int b )
{
@@ -1470,37 +1561,44 @@
return sign >= 0 ? (FT_Int32)mid : -(FT_Int32)mid;
}
-#else
+#endif /* !TT_MulFix14 */
- /* compute (a*b)/2^14 with maximum accuracy and rounding */
- static FT_Int32
- TT_MulFix14( FT_Int32 a,
- FT_Int b )
- {
- FT_Int32 m, s, hi;
- FT_UInt32 l, lo;
+#if defined( __GNUC__ ) && \
+ ( defined( __i386__ ) || \
+ defined( __x86_64__ ) || \
+ defined( __arm__ ) )
- /* compute ax*bx as 64-bit value */
- l = (FT_UInt32)( ( a & 0xFFFFU ) * b );
- m = ( a >> 16 ) * b;
+#define TT_DotFix14 TT_DotFix14_long_long
- lo = l + ( (FT_UInt32)m << 16 );
- hi = ( m >> 16 ) + ( (FT_Int32)l >> 31 ) + ( lo < l );
+ static __attribute__(( pure )) FT_Int32
+ TT_DotFix14_long_long( FT_Int32 ax,
+ FT_Int32 ay,
+ FT_Int bx,
+ FT_Int by )
+ {
+ /* Temporarily disable the warning that C90 doesn't support */
+ /* `long long'. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wlong-long"
- /* divide the result by 2^14 with rounding */
- s = hi >> 31;
- l = lo + (FT_UInt32)s;
- hi += s + ( l < lo );
- lo = l;
+ long long temp1 = (long long)ax * bx;
+ long long temp2 = (long long)ay * by;
- l = lo + 0x2000U;
- hi += l < lo;
- return (FT_Int32)( ( (FT_UInt32)hi << 18 ) | ( l >> 14 ) );
+ temp1 += temp2;
+ temp2 = temp1 >> 63;
+ temp1 += 0x2000 + temp2;
+
+ return (FT_Int32)( temp1 >> 14 );
+
+#pragma GCC diagnostic pop
}
-#endif
+#endif /* __GNUC__ && (__arm__ || __i386__ || __x86_64__) */
+
+
+#ifndef TT_DotFix14
/* compute (ax*bx+ay*by)/2^14 with maximum accuracy and rounding */
static FT_Int32
@@ -1543,6 +1641,8 @@
return (FT_Int32)( ( (FT_UInt32)hi << 18 ) | ( l >> 14 ) );
}
+#endif /* TT_DotFix14 */
+
/*************************************************************************/
/* */