Edit

IABSD.fr/xenocara/driver/xf86-video-openchrome/src/via_memcpy.c

Branch :

  • Show log

    Commit

  • Author : fcambus
    Date : 2021-11-23 17:50:30
    Hash : 604ad564
    Message : Update openchrome to 0.6.409. OK and with help from matthieu@

  • driver/xf86-video-openchrome/src/via_memcpy.c
  • /*
     * Copyright (C) 2004 Thomas Hellström, All Rights Reserved.
     *
     * Permission is hereby granted, free of charge, to any person obtaining a
     * copy of this software and associated documentation files (the "Software"),
     * to deal in the Software without restriction, including without limitation
     * the rights to use, copy, modify, merge, publish, distribute, sub license,
     * and/or sell copies of the Software, and to permit persons to whom the
     * Software is furnished to do so, subject to the following conditions:
     *
     * The above copyright notice and this permission notice (including the
     * next paragraph) shall be included in all copies or substantial portions
     * of the Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     * DEALINGS IN THE SOFTWARE.
     */
    
    #ifdef HAVE_CONFIG_H
    #include "config.h"
    #endif
    
    #include "via_driver.h"
    #include "compiler.h"
    
    
    #define BSIZ 2048  /* size of /proc/cpuinfo buffer */
    #define BSIZW 720  /* typical copy width (YUV420) */
    #define BSIZA 736  /* multiple of 32 bytes */
    #define BSIZH 576  /* typical copy height */
    
    #define SSE_PREFETCH "  prefetchnta "
    #define FENCE __asm__ __volatile__ ("sfence":::"memory");
    #define FENCEMMS __asm__ __volatile__ ("\t"		\
    				       "sfence\n\t"	\
    				       "emms\n\t"	\
    				       :::"memory");
    #define FEMMS __asm__ __volatile__("femms":::"memory");
    #define EMMS __asm__ __volatile__("emms":::"memory");
    
    #define NOW_PREFETCH "  prefetch "
    
    
    #define PREFETCH1(arch_prefetch,from)			\
        __asm__ __volatile__ (				\
    			  "1:  " arch_prefetch "(%0)\n"	\
    			  arch_prefetch "32(%0)\n"	\
    			  arch_prefetch "64(%0)\n"	\
    			  arch_prefetch "96(%0)\n"	\
    			  arch_prefetch "128(%0)\n"	\
    			  arch_prefetch "160(%0)\n"	\
    			  arch_prefetch "192(%0)\n"	\
    			  arch_prefetch "256(%0)\n"	\
    			  arch_prefetch "288(%0)\n"	\
    			  "2:\n"			\
    			  : : "r" (from) );
    
    #define PREFETCH2(arch_prefetch,from)			\
        __asm__ __volatile__ (				\
    			  arch_prefetch "320(%0)\n"	\
    			  : : "r" (from) );
    #define PREFETCH3(arch_prefetch,from)			\
        __asm__ __volatile__ (				\
    			  arch_prefetch "288(%0)\n"	\
    			  : : "r" (from) );
    
    
    #define small_memcpy(to, from, n)					\
        {									\
    	__asm__ __volatile__(						\
    			     "movl %2,%%ecx\n\t"			\
    			     "sarl $2,%%ecx\n\t"			\
    			     "rep ; movsl\n\t"				\
    			     "testb $2,%b2\n\t"				\
    			     "je 1f\n\t"				\
    			     "movsw\n"					\
    			     "1:\ttestb $1,%b2\n\t"			\
    			     "je 2f\n\t"				\
    			     "movsb\n"					\
    			     "2:"					\
    			     :"=&D" (to), "=&S" (from)			\
    			     :"q" (n),"0" ((long) to),"1" ((long) from) \
    			     : "%ecx","memory");			\
        }
    
    
    #define SSE_CPY(prefetch, from, to, dummy, lcnt)			\
        if ((unsigned long) from & 15) {					\
    	__asm__ __volatile__ (						\
    			      "1:\n"					\
    			      prefetch "320(%1)\n"			\
    			      "  movups (%1), %%xmm0\n"			\
    			      "  movups 16(%1), %%xmm1\n"		\
    			      "  movntps %%xmm0, (%0)\n"		\
    			      "  movntps %%xmm1, 16(%0)\n"		\
                                  prefetch "352(%1)\n"			\
    			      "  movups 32(%1), %%xmm2\n"		\
    			      "  movups 48(%1), %%xmm3\n"		\
    			      "  movntps %%xmm2, 32(%0)\n"		\
    			      "  movntps %%xmm3, 48(%0)\n"		\
    			      "  addl $64,%0\n"				\
    			      "  addl $64,%1\n"				\
    			      "  decl %2\n"				\
    			      "  jne 1b\n"				\
    			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
    			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
        } else {								\
    	__asm__ __volatile__ (						\
    			      "2:\n"					\
    			      prefetch "320(%1)\n"			\
    			      "  movaps (%1), %%xmm0\n"			\
    			      "  movaps 16(%1), %%xmm1\n"		\
    			      "  movntps %%xmm0, (%0)\n"		\
    			      "  movntps %%xmm1, 16(%0)\n"		\
    			      prefetch "352(%1)\n"			\
    			      "  movaps 32(%1), %%xmm2\n"		\
    			      "  movaps 48(%1), %%xmm3\n"		\
    			      "  movntps %%xmm2, 32(%0)\n"		\
    			      "  movntps %%xmm3, 48(%0)\n"		\
    			      "  addl $64,%0\n"				\
    			      "  addl $64,%1\n"				\
    			      "  decl %2\n"				\
    			      "  jne 2b\n"				\
    			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
    			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
        }
    
    #define MMX_CPY(prefetch, from, to, dummy, lcnt)			\
        __asm__ __volatile__ (						\
    			  "1:\n"					\
    			  prefetch "320(%1)\n"				\
    			  "2:  movq (%1), %%mm0\n"			\
    			  "  movq 8(%1), %%mm1\n"			\
    			  "  movq 16(%1), %%mm2\n"			\
    			  "  movq 24(%1), %%mm3\n"			\
    			  "  movq %%mm0, (%0)\n"			\
    			  "  movq %%mm1, 8(%0)\n"			\
    			  "  movq %%mm2, 16(%0)\n"			\
    			  "  movq %%mm3, 24(%0)\n"			\
    			  prefetch "352(%1)\n"				\
    			  "  movq 32(%1), %%mm0\n"			\
    			  "  movq 40(%1), %%mm1\n"			\
    			  "  movq 48(%1), %%mm2\n"			\
    			  "  movq 56(%1), %%mm3\n"			\
    			  "  movq %%mm0, 32(%0)\n"			\
    			  "  movq %%mm1, 40(%0)\n"			\
    			  "  movq %%mm2, 48(%0)\n"			\
    			  "  movq %%mm3, 56(%0)\n"			\
    			  "  addl $64,%0\n"				\
    			  "  addl $64,%1\n"				\
    			  "  decl %2\n"					\
    			  "  jne 1b\n"					\
    			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
    			  :"0" (to), "1" (from), "2" (lcnt) : "memory");
    
    #define MMXEXT_CPY(prefetch, from, to, dummy, lcnt)			\
        __asm__ __volatile__ (						\
    			  ".p2align 4,,7\n"				\
    			  "1:\n"					\
    			  prefetch "320(%1)\n"				\
    			  "  movq (%1), %%mm0\n"			\
    			  "  movq 8(%1), %%mm1\n"			\
    			  "  movq 16(%1), %%mm2\n"			\
    			  "  movq 24(%1), %%mm3\n"			\
    			  "  movntq %%mm0, (%0)\n"			\
    			  "  movntq %%mm1, 8(%0)\n"			\
    			  "  movntq %%mm2, 16(%0)\n"			\
    			  "  movntq %%mm3, 24(%0)\n"			\
    			  prefetch "352(%1)\n"				\
    			  "  movq 32(%1), %%mm0\n"			\
    			  "  movq 40(%1), %%mm1\n"			\
    			  "  movq 48(%1), %%mm2\n"			\
    			  "  movq 56(%1), %%mm3\n"			\
    			  "  movntq %%mm0, 32(%0)\n"			\
    			  "  movntq %%mm1, 40(%0)\n"			\
    			  "  movntq %%mm2, 48(%0)\n"			\
    			  "  movntq %%mm3, 56(%0)\n"			\
    			  "  addl $64,%0\n"				\
    			  "  addl $64,%1\n"				\
    			  "  decl %2\n"					\
    			  "  jne 1b\n"					\
    			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
    			  :"0" (to), "1" (from), "2" (lcnt) : "memory");
    
    
    #define PREFETCH_FUNC(prefix, itype, ptype, begin, fence)		\
    									\
        static void prefix##_YUV42X(unsigned char *to,			\
    				const unsigned char *from,		\
    				int dstPitch,				\
    				int w,					\
    				int h,					\
    				int yuv422)				\
        {									\
    	int dadd, rest, count, hc, lcnt;				\
    	register int dummy;						\
    	PREFETCH1(ptype##_PREFETCH, from);				\
    	begin;								\
    	count = 2;							\
    									\
    	/* If destination pitch equals width, do it all in one go. */	\
    									\
    	if (yuv422) {							\
    	    w <<= 1;							\
    	    if (w == dstPitch) {					\
    		w *= h;							\
    		h = 1;							\
    		dstPitch = w;						\
    		count = 0;						\
    	    } else {							\
    		h -= 1;							\
    		count = 1;						\
    	    }								\
    	} else if (w == dstPitch) {					\
    	    w = h*(w + (w >> 1));					\
    	    count = 0;							\
    	    h = 1;							\
    	    dstPitch = w;						\
    	}								\
    									\
    	lcnt = w >> 6;							\
    	rest = w & 63;							\
    	while (count--) {						\
    	    hc = h;							\
    	    lcnt = w >> 6;						\
    	    rest = w & 63;						\
    	    dadd = dstPitch - w;					\
    	    while (hc--) {						\
    		if (lcnt) {						\
    		    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \
    		}							\
    		if (rest) {						\
    		    PREFETCH2(ptype##_PREFETCH, from);			\
    		    small_memcpy(to, from, rest);			\
    		    PREFETCH3(ptype##_PREFETCH, from);			\
    		}							\
    		to += dadd;						\
    	    }								\
    	    w >>= 1;							\
    	    dstPitch >>= 1;						\
    	    h -= 1;							\
    	}								\
    	if (lcnt > 5) {							\
    	    lcnt -= 5;							\
    	    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt);	\
    	    lcnt = 5;							\
    	}								\
    	if (lcnt) {							\
    	    itype##_CPY("#", from, to, dummy, lcnt);			\
    	}								\
    	if (rest) small_memcpy(to, from, rest);				\
    	fence;								\
        }
    
    #define NOPREFETCH_FUNC(prefix, itype, begin, fence)			\
        static void prefix##_YUV42X(unsigned char *to,			\
    				const unsigned char *from,		\
    				int dstPitch,				\
    				int w,					\
    				int h,					\
    				int yuv422)				\
    									\
        {									\
    	int dadd, rest, count, hc, lcnt;				\
    	register int dummy;						\
    	begin;								\
    	count = 2;							\
    									\
    	/* If destination pitch equals width, do it all in one go. */	\
    									\
    	if (yuv422) {							\
    	    w <<= 1;							\
    	    count = 1;							\
    	    if (w == dstPitch) {					\
    		w *= h;							\
    		h = 1;							\
    		dstPitch = w;						\
    	    }								\
    	} else if (w == dstPitch) {					\
    	    w = h*(w + (w >> 1));					\
    	    count = 1;							\
    	    h = 1;							\
    	    dstPitch = w;						\
    	}								\
    									\
    	lcnt = w >> 6;							\
    	rest = w & 63;							\
    	while (count--) {						\
    	    hc = h;							\
    	    dadd = dstPitch - w;					\
    	    lcnt = w >> 6;						\
    	    rest = w & 63;						\
    	    while (hc--) {						\
    		if (lcnt) {						\
    		    itype##_CPY("#", from, to, dummy, lcnt);		\
    		}							\
    		if (rest) small_memcpy(to, from, rest);			\
    		to += dadd;						\
    	    }								\
    	    w >>= 1;							\
    	    dstPitch >>= 1;						\
    	}								\
    	fence;								\
        }
    
    
    static void
    libc_YUV42X(unsigned char *dst, const unsigned char *src,
                int dstPitch, int w, int h, int yuv422)
    {
        if (yuv422)
            w <<= 1;
        if (dstPitch == w) {
            int size = h * ((yuv422) ? w : (w + (w >> 1)));
    
            memcpy(dst, src, size);
            return;
        } else {
            int count;
    
            /* Copy Y component to video memory. */
            count = h;
            while (count--) {
                memcpy(dst, src, w);
                src += w;
                dst += dstPitch;
            }
    
            /* UV component is 1/2 of Y. */
            if (!yuv422) {
                w >>= 1;
                dstPitch >>= 1;
    
                /* Copy V(Cr),U(Cb) components to video memory. */
                count = h;
                while (count--) {
                    memcpy(dst, src, w);
                    src += w;
                    dst += dstPitch;
                }
            }
        }
    }
    
    #ifdef __i386__
    
    /* Linux kernel __memcpy. */
    static __inline void *
    __memcpy(void *to, const void *from, size_t n)
    {
        int d1, d2, d3;
    
        __asm__ __volatile__(
                             "rep ; movsl\n\t"
                             "testb $2,%b4\n\t"
                             "je 1f\n\t"
                             "movsw\n"
                             "1:\ttestb $1,%b4\n\t"
                             "je 2f\n\t"
                             "movsb\n"
                             "2:"
                             :"=&c"(d1), "=&D"(d2), "=&S"(d3)
                             :"0"(n >> 2), "q"(n), "1"((long)to), "2"((long)from)
                             :"memory");
    
        return (to);
    }
    
    
    static void
    kernel_YUV42X(unsigned char *dst, const unsigned char *src,
                  int dstPitch, int w, int h, int yuv422)
    {
        if (yuv422)
            w <<= 1;
        if (dstPitch == w) {
            int size = h * ((yuv422) ? w : (w + (w >> 1)));
    
            __memcpy(dst, src, size);
            return;
        } else {
            int count;
    
            /* Copy Y component to video memory. */
            count = h;
            while (count--) {
                __memcpy(dst, src, w);
                src += w;
                dst += dstPitch;
            }
    
            /* UV component is 1/2 of Y. */
            if (!yuv422) {
    
                w >>= 1;
                dstPitch >>= 1;
    
                /* Copy V(Cr),U(Cb) components to video memory. */
                count = h;
                while (count--) {
                    __memcpy(dst, src, w);
                    src += w;
                    dst += dstPitch;
                }
            }
        }
    }
    
    PREFETCH_FUNC(sse, SSE, SSE,, FENCE)
    PREFETCH_FUNC(mmxext, MMXEXT, SSE, EMMS, FENCEMMS)
    PREFETCH_FUNC(now, MMX, NOW, FEMMS, FEMMS)
    NOPREFETCH_FUNC(mmx, MMX, EMMS, EMMS)
    
    static void
    *kernel_memcpy(void *to, const void *from, size_t len)
    {
        return __memcpy(to, from, len);
    }
    
    static unsigned
    fastrdtsc(void)
    {
        unsigned eax;
    
        __asm__ volatile ("\t"
                          "pushl %%ebx\n\t"
                          "cpuid\n\t"
                          ".byte 0x0f, 0x31\n\t"
                          "popl %%ebx\n"
                          :"=a" (eax)
                          :"0"(0)
                          :"ecx", "edx", "cc");
    
        return eax;
    }
    
    
    static unsigned
    time_function(vidCopyFunc mf, unsigned char *buf1, unsigned char *buf2)
    {
        unsigned t, t2;
    
        t = fastrdtsc();
    
        (*mf) (buf1, buf2, BSIZA, BSIZW, BSIZH, 0);
    
        t2 = fastrdtsc();
        return ((t < t2) ? t2 - t : 0xFFFFFFFFU - (t - t2 - 1));
    }
    
    enum
    { libc = 0, kernel, sse, mmx, now, mmxext, totNum };
    
    typedef struct
    {
        vidCopyFunc mFunc;
        const char *mName, **cpuFlag;
    } McFuncData;
    
    static const char *libc_cpuflags[] = { " ", 0 };
    static const char *kernel_cpuflags[] = { " ", 0 };
    static const char *sse_cpuflags[] = { " sse ", 0 };
    static const char *mmx_cpuflags[] = { " mmx ", 0 };
    static const char *now_cpuflags[] = { " 3dnow ", 0 };
    static const char *mmx2_cpuflags[] = { " mmxext ", " sse ", 0 };
    
    static McFuncData mcFunctions[totNum] = {
    {libc_YUV42X, "libc", libc_cpuflags},
    {kernel_YUV42X, "kernel", kernel_cpuflags},
    {sse_YUV42X, "SSE", sse_cpuflags},
    {mmx_YUV42X, "MMX", mmx_cpuflags},
    {now_YUV42X, "3DNow!", now_cpuflags},
    {mmxext_YUV42X, "MMX2", mmx2_cpuflags}
    };
    
    
    static int
    flagValid(const char *cpuinfo, const char *flag)
    {
        const char *flagLoc, *nextProc;
        int located = 0;
    
        while ((cpuinfo = strstr(cpuinfo, "processor\t:"))) {
            located = 1;
            cpuinfo += 11;
            if ((flagLoc = strstr(cpuinfo, flag))) {
                if ((nextProc = strstr(cpuinfo, "processor\t:"))) {
                    if (nextProc < flagLoc)
                        return 0;
                }
            } else {
                return 0;
            }
        }
        return located;
    }
    
    
    static int
    cpuValid(const char *cpuinfo, const char **flags)
    {
        for (; *flags != 0; flags++) {
            if (flagValid(cpuinfo, *flags))
                return 1;
        }
        return 0;
    }
    
    /*
     * Benchmark the video copy routines and choose the fastest.
     */
    vidCopyFunc
    viaVidCopyInit(const char *copyType, ScreenPtr pScreen)
    {
        ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
    
        char buf[BSIZ];
        unsigned char *buf1, *buf2, *buf3;
        char *tmpBuf, *endBuf;
        int count, j, bestSoFar;
        unsigned best, tmp, testSize, alignSize, tmp2;
        struct buffer_object *tmpFbBuffer;
        McFuncData *curData;
        FILE *cpuInfoFile;
        double cpuFreq;
    
        if (NULL == (cpuInfoFile = fopen("/proc/cpuinfo", "r"))) {
            return libc_YUV42X;
        }
        count = fread(buf, 1, BSIZ, cpuInfoFile);
        if (ferror(cpuInfoFile)) {
            fclose(cpuInfoFile);
            return libc_YUV42X;
        }
        fclose(cpuInfoFile);
        if (BSIZ == count) {
            xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
                       "\"/proc/cpuinfo\" file too long. "
                       "Using Linux kernel memcpy.\n");
            return libc_YUV42X;
        }
        buf[count] = 0;
    
        while (count--)
            if ('\n' == buf[count])
                buf[count] = ' ';
    
        /* Extract the CPU frequency. */
        cpuFreq = 0.;
        if (NULL != (tmpBuf = strstr(buf, "cpu MHz"))) {
            if (NULL != (tmpBuf = strstr(tmpBuf, ":") + 1)) {
                cpuFreq = strtod(tmpBuf, &endBuf);
                if (endBuf == tmpBuf)
                    tmpBuf = NULL;
            }
        }
    
        alignSize = BSIZH * (BSIZA + (BSIZA >> 1));
        testSize = BSIZH * (BSIZW + (BSIZW >> 1));
        /*
         * Allocate an area of offscreen FB memory, (buf1), a simulated video
         * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
         */
        tmpFbBuffer = drm_bo_alloc(pScrn, alignSize, 32, TTM_PL_VRAM);
        if (!tmpFbBuffer)
            return libc_YUV42X;
        if (NULL == (buf2 = (unsigned char *)malloc(testSize))) {
            drm_bo_free(pScrn, tmpFbBuffer);
            return libc_YUV42X;
        }
        if (NULL == (buf3 = (unsigned char *)malloc(testSize))) {
            free(buf2);
            drm_bo_free(pScrn, tmpFbBuffer);
            return libc_YUV42X;
        }
        buf1 = drm_bo_map(pScrn, tmpFbBuffer);
        bestSoFar = 0;
        best = 0xFFFFFFFFU;
    
        /* Make probable that buf1 and buf2 are in memory by referencing them. */
        libc_YUV42X(buf1, buf2, BSIZA, BSIZW, BSIZH, 0);
    
        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
                   "Benchmarking %s copy.  Less time is better.\n", copyType);
        for (j = 0; j < totNum; ++j) {
            curData = mcFunctions + j;
    
            if (cpuValid(buf, curData->cpuFlag)) {
    
                /* Simulate setup of the video buffer. */
                kernel_memcpy(buf2, buf3, testSize);
    
                /* Copy the video buffer to frame-buffer memory. */
                tmp = time_function(curData->mFunc, buf1, buf2);
    
                /* Do it again to avoid context-switch effects. */
                kernel_memcpy(buf2, buf3, testSize);
                tmp2 = time_function(curData->mFunc, buf1, buf2);
                tmp = (tmp2 < tmp) ? tmp2 : tmp;
    
                if (NULL == tmpBuf) {
                    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                               "Timed %6s YUV420 copy... %u.\n",
                               curData->mName, tmp);
                } else {
                    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                               "Timed %6s YUV420 copy... %u. "
                               "Throughput: %.1f MiB/s.\n",
                               curData->mName, tmp,
                               cpuFreq * 1.e6 * (double)testSize /
                               ((double)(tmp) * (double)(0x100000)));
                }
                if (tmp < best) {
                    best = tmp;
                    bestSoFar = j;
                }
            } else {
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                           "Ditching %6s YUV420 copy. Not supported by CPU.\n",
                           curData->mName);
            }
        }
        free(buf3);
        free(buf2);
        drm_bo_unmap(pScrn, tmpFbBuffer);
        drm_bo_free(pScrn, tmpFbBuffer);
        xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                   "Using %s YUV42X copy for %s.\n",
                   mcFunctions[bestSoFar].mName, copyType);
        return mcFunctions[bestSoFar].mFunc;
    }
    
    #else
    
    vidCopyFunc
    viaVidCopyInit(const char *copyType, ScreenPtr pScreen)
    {
        ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
    
        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
                   "Using default xfree86 memcpy for video.\n");
        return libc_YUV42X;
    }
    
    #endif /* __i386__ */