Hash :
ca8c1693
Author :
Date :
2012-08-27T16:25:43
Moved SSE2 code to separated file to allow using different compiler flags on GCC To use SSE2 in GCC, a compiler -msse2 flag is needed. It adds both possibility to write SSE2 code in the file, but also allows compiles to optimize the whole file with SSE2. To make sure that the code works on non-SSE2 machines, moving SSE2 functions to separated file and compiling only that file with -msse2 is the common practice. Issue=358 Signed-off-by: Daniel Koch git-svn-id: https://angleproject.googlecode.com/svn/trunk@1262 736b8ea6-26fd-11df-bfd4-992fa37f6226
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
//
// Copyright (c) 2002-2010 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// TextureSSE2.cpp: Implements SSE2-based functions of gl::Image class. It's
// in a separated file for GCC, which can enable SSE usage only per-file,
// not for code blocks that use SSE2 explicitly.
#include "libGLESv2/Texture.h"
#include <intrin.h>
namespace gl
{
void Image::loadRGBAUByteDataSSE2(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const
{
const unsigned int *source = NULL;
unsigned int *dest = NULL;
__m128i brMask = _mm_set1_epi32(0x00ff00ff);
for (int y = 0; y < height; y++)
{
source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputPitch);
dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputPitch);
int x = 0;
// Make output writes aligned
for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
{
unsigned int rgba = source[x];
dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
}
for (; x + 3 < width; x += 4)
{
__m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
// Mask out g and a, which don't change
__m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
// Mask out b and r
__m128i brComponents = _mm_and_si128(sourceData, brMask);
// Swap b and r
__m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
__m128i result = _mm_or_si128(gaComponents, brSwapped);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
}
// Perform leftover writes
for (; x < width; x++)
{
unsigned int rgba = source[x];
dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
}
}
}
void Image::loadAlphaDataSSE2(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const
{
const unsigned char *source = NULL;
unsigned int *dest = NULL;
__m128i zeroWide = _mm_setzero_si128();
for (int y = 0; y < height; y++)
{
source = static_cast<const unsigned char*>(input) + y * inputPitch;
dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputPitch);
int x;
// Make output writes aligned
for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
{
dest[x] = static_cast<unsigned int>(source[x]) << 24;
}
for (; x + 7 < width; x += 8)
{
__m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
// Interleave each byte to 16bit, make the lower byte to zero
sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
// Interleave each 16bit to 32bit, make the lower 16bit to zero
__m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
__m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
}
// Handle the remainder
for (; x < width; x++)
{
dest[x] = static_cast<unsigned int>(source[x]) << 24;
}
}
}
}