Edit

kc3-lang/angle/src/libANGLE/renderer/d3d/loadimageSSE2.cpp

Branch :

  • Show log

    Commit

  • Author : Geoff Lang
    Date : 2015-01-05 15:02:18
    Hash : c2ca196e
    Message : Move loadimage to the d3d folder. BUG=angle:681 Change-Id: Id901d9ae66207ac5e7d2844b8cd1f6a58c4492f6 Reviewed-on: https://chromium-review.googlesource.com/238478 Reviewed-by: Geoff Lang <geofflang@chromium.org> Tested-by: Geoff Lang <geofflang@chromium.org>

  • src/libANGLE/renderer/d3d/loadimageSSE2.cpp
  • //
    // Copyright (c) 2002-2015 The ANGLE Project Authors. All rights reserved.
    // Use of this source code is governed by a BSD-style license that can be
    // found in the LICENSE file.
    //
    
    // loadimageSSE2.cpp: Defines the SSE2 image loading functions. They live
    // in a separate file because GCC can only enable SSE usage per file,
    // not for individual code blocks that use SSE2 explicitly.
    
    #include "libANGLE/renderer/d3d/loadimage.h"
    
    #include "common/platform.h"
    
    #ifdef ANGLE_USE_SSE
    #include <emmintrin.h>
    #endif
    
    namespace rx
    {
    
    // Expands every 8-bit source texel into a 32-bit destination texel whose most
    // significant byte holds the source value and whose remaining bits are zero.
    //
    // width/height/depth give the extent of the image in texels, rows and slices.
    // input/output are the base pointers of the source and destination images; the
    // row and depth pitches are forwarded to OffsetDataPointer to locate each row.
    //
    // When ANGLE_USE_SSE is not defined the function only reports UNIMPLEMENTED().
    void LoadA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
                            const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                            uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
    {
    #if defined(ANGLE_USE_SSE)
        const __m128i zeros = _mm_setzero_si128();

        for (size_t slice = 0; slice < depth; slice++)
        {
            for (size_t row = 0; row < height; row++)
            {
                const uint8_t *srcRow = OffsetDataPointer<uint8_t>(input, row, slice, inputRowPitch, inputDepthPitch);
                uint32_t *dstRow = OffsetDataPointer<uint32_t>(output, row, slice, outputRowPitch, outputDepthPitch);

                size_t col = 0;

                // Scalar prologue: advance one texel at a time until the destination
                // address is 16-byte aligned (required by the aligned stores below),
                // or until the row is exhausted.
                while (col < width && (reinterpret_cast<intptr_t>(dstRow + col) & 0xF) != 0)
                {
                    dstRow[col] = static_cast<uint32_t>(srcRow[col]) << 24;
                    col++;
                }

                // Vector body: eight texels per iteration.
                while (col + 7 < width)
                {
                    // Fetch eight source bytes into the low half of a register.
                    const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcRow + col));

                    // Pair each byte with a zero byte below it: 8 x 16-bit (value << 8).
                    const __m128i widened16 = _mm_unpacklo_epi8(zeros, packed);

                    // Pair each 16-bit lane with a zero lane below it: (value << 24),
                    // four texels in each of the two result registers.
                    const __m128i firstFour = _mm_unpacklo_epi16(zeros, widened16);
                    const __m128i lastFour  = _mm_unpackhi_epi16(zeros, widened16);

                    __m128i *dstVec = reinterpret_cast<__m128i*>(dstRow + col);
                    _mm_store_si128(dstVec, firstFour);
                    _mm_store_si128(dstVec + 1, lastFour);

                    col += 8;
                }

                // Scalar epilogue: whatever did not fill a full group of eight.
                while (col < width)
                {
                    dstRow[col] = static_cast<uint32_t>(srcRow[col]) << 24;
                    col++;
                }
            }
        }
    #else
        // Without SSE support (e.g. ARM builds) the intrinsics above are not
        // available, so report this function as not implemented.
        UNIMPLEMENTED();
    #endif
    }
    
    // Copies 32-bit texels from input to output while exchanging bits 0-7 with
    // bits 16-23 of every texel (the r and b channels); bits 8-15 and 24-31
    // (g and a) are copied unchanged.
    //
    // width/height/depth give the extent of the image in texels, rows and slices.
    // input/output are the base pointers of the source and destination images; the
    // row and depth pitches are forwarded to OffsetDataPointer to locate each row.
    //
    // When ANGLE_USE_SSE is not defined the function only reports UNIMPLEMENTED().
    void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
                               const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                               uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
    {
    #if defined(ANGLE_USE_SSE)
        // Selects the two bytes of each texel that must trade places.
        const __m128i swapMask = _mm_set1_epi32(0x00ff00ff);

        for (size_t slice = 0; slice < depth; slice++)
        {
            for (size_t row = 0; row < height; row++)
            {
                const uint32_t *srcRow = OffsetDataPointer<uint32_t>(input, row, slice, inputRowPitch, inputDepthPitch);
                uint32_t *dstRow = OffsetDataPointer<uint32_t>(output, row, slice, outputRowPitch, outputDepthPitch);

                size_t col = 0;

                // Scalar prologue: advance one texel at a time until the destination
                // address is 16-byte aligned (required by the aligned store below),
                // or until the row is exhausted.
                while (col < width && (reinterpret_cast<intptr_t>(dstRow + col) & 15) != 0)
                {
                    const uint32_t texel = srcRow[col];
                    const uint32_t swappedPart = ANGLE_ROTL(texel, 16) & 0x00ff00ff;
                    const uint32_t keptPart = texel & 0xff00ff00;
                    dstRow[col] = swappedPart | keptPart;
                    col++;
                }

                // Vector body: four texels per iteration. The source is read with an
                // unaligned load because only the destination was aligned above.
                while (col + 3 < width)
                {
                    const __m128i texels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(srcRow + col));

                    // Bytes that stay where they are (g and a).
                    const __m128i kept = _mm_andnot_si128(swapMask, texels);

                    // Bytes that move (b and r), one per 16-bit lane.
                    const __m128i moving = _mm_and_si128(texels, swapMask);

                    // Exchange adjacent 16-bit lanes, first in the low 64 bits and
                    // then in the high 64 bits, which swaps the two halves of every texel.
                    const __m128i lowSwapped = _mm_shufflelo_epi16(moving, _MM_SHUFFLE(2, 3, 0, 1));
                    const __m128i allSwapped = _mm_shufflehi_epi16(lowSwapped, _MM_SHUFFLE(2, 3, 0, 1));

                    _mm_store_si128(reinterpret_cast<__m128i*>(dstRow + col), _mm_or_si128(kept, allSwapped));

                    col += 4;
                }

                // Scalar epilogue: whatever did not fill a full group of four.
                while (col < width)
                {
                    const uint32_t texel = srcRow[col];
                    const uint32_t swappedPart = ANGLE_ROTL(texel, 16) & 0x00ff00ff;
                    const uint32_t keptPart = texel & 0xff00ff00;
                    dstRow[col] = swappedPart | keptPart;
                    col++;
                }
            }
        }
    #else
        // Without SSE support (e.g. ARM builds) the intrinsics above are not
        // available, so report this function as not implemented.
        UNIMPLEMENTED();
    #endif
    }
    
    }