Commit c9d7c03fa1b5c6244759ade814c546da04080646

Behdad Esfahbod 2017-08-15T08:48:17

[sfnt] Speed up PNG image loading. This reduces the overhead of `premultiply_data' by 60%. * src/sfnt/pngshim.c (premultiply_data): Provide code which uses gcc's (and clang's) `vector_size' attribute to process 4 pixels at a time.

diff --git a/ChangeLog b/ChangeLog
index 4091498..06ea99e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2017-08-15  Behdad Esfahbod  <behdad@behdad.org>
+
+	[sfnt] Speed up PNG image loading.
+
+	This reduces the overhead of `premultiply_data' by 60%.
+
+	* src/sfnt/pngshim.c (premultiply_data): Provide code which uses
+	gcc's (and clang's) `vector_size' attribute to process 4 pixels at a
+	time.
+
 2017-08-11  Werner Lemberg  <wl@gnu.org>
 
 	[sfnt, truetype] Improve handling of missing sbits.
diff --git a/src/sfnt/pngshim.c b/src/sfnt/pngshim.c
index b9b296e..90a5669 100644
--- a/src/sfnt/pngshim.c
+++ b/src/sfnt/pngshim.c
@@ -49,18 +49,65 @@
   }
 
 
-  /* Premultiplies data and converts RGBA bytes => native endian. */
+  /* Premultiplies data and converts RGBA bytes => BGRA. */
   static void
   premultiply_data( png_structp    png,
                     png_row_infop  row_info,
                     png_bytep      data )
   {
-    unsigned int  i;
+    unsigned int  i = 0, limit;
 
     FT_UNUSED( png );
 
+    /* the `vector_size' attribute was introduced in gcc 3.1, which */
+    /* predates clang; the `__BYTE_ORDER__' preprocessor symbol was */
+    /* introduced in gcc 4.6 and clang 3.2, respectively            */
+#if ( ( defined( __GNUC__ )                                &&             \
+        ( ( __GNUC__ >= 5 )                              ||               \
+        ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 6 ) ) ) )         ||   \
+      ( defined( __clang__ )                                       &&     \
+        ( ( __clang_major__ >= 4 )                               ||       \
+        ( ( __clang_major__ == 3 ) && ( __clang_minor__ >= 2 ) ) ) ) ) && \
+    defined( __OPTIMIZE__ )                                            && \
+    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 
-    for ( i = 0; i < row_info->rowbytes; i += 4 )
+    typedef unsigned short  v82 __attribute__(( vector_size( 16 ) ));
+
+
+    /* process blocks of 16 bytes in one rush, which gives a nice speed-up */
+    limit = row_info->rowbytes - 16 + 1;
+    for ( ; i < limit; i += 16 )
+    {
+      char*  base = &data[i];
+
+      v82  s, s0, s1, a;
+      v82  ma = { 1, 1, 3, 3, 5, 5, 7, 7 };
+      v82  o1 = { 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF };
+      v82  m0 = { 1, 0, 3, 2, 5, 4, 7, 6 };
+
+
+      memcpy( &s, base, 16 );               /* RGBA RGBA RGBA RGBA */
+      s0 = s & 0xFF;                        /*  R B  R B  R B  R B */
+      s1 = s >> 8;                          /*  G A  G A  G A  G A */
+
+      a  = __builtin_shuffle( s1, ma );     /*  A A  A A  A A  A A */
+      s1 |= o1;                             /*  G 1  G 1  G 1  G 1 */
+      s0 = __builtin_shuffle( s0, m0 );     /*  B R  B R  B R  B R */
+
+      s0 *= a;
+      s1 *= a;
+      s0 += 0x80;
+      s1 += 0x80;
+      s0 = ( s0 + ( s0 >> 8 ) ) >> 8;
+      s1 = ( s1 + ( s1 >> 8 ) ) >> 8;
+
+      s = s0 | ( s1 << 8 );
+      memcpy( base, &s, 16 );
+    }
+#endif /* use `vector_size' */
+
+    limit = row_info->rowbytes;
+    for ( ; i < limit; i += 4 )
     {
       unsigned char*  base  = &data[i];
       unsigned int    alpha = base[3];