Edit

kc3-lang/avir/avir_dil.h

Branch :

  • Show log

    Commit

  • Author : Aleksey Vaneev
    Date : 2015-11-10 19:23:24
    Hash : 4222213a
    Message : Fixed access violation due to a possible unaligned SIMD variable access.

  • avir_dil.h
  • //$ nobt
    //$ nocpp
    
    /**
     * @file avir_dil.h
     *
     * @brief Inclusion file for de-interleaved image resizing functions.
     *
     * This file includes the "CImageResizerFilterStepDIL" class which implements
     * image resizing functions in de-interleaved mode.
     *
     * AVIR Copyright (c) 2015 Aleksey Vaneev
     */
    
    namespace avir {
    
    /**
     * @brief De-interleaved filtering steps implementation class.
     *
     * This class implements scanline filtering functions in de-interleaved mode.
     * This means that pixels are processed in groups.
     *
     * @tparam fptype Floating point type to use for storing pixel elements.
     * SIMD types cannot be used.
     * @tparam fptypesimd The SIMD type used to store a pack of "fptype" values.
     */
    
    template< class fptype, class fptypesimd >
    class CImageResizerFilterStepDIL :
    	public CImageResizerFilterStep< fptype, fptype >
    {
    public:
    	using CImageResizerFilterStep< fptype, fptype > :: IsUpsample;
    	using CImageResizerFilterStep< fptype, fptype > :: ResampleFactor;
    	using CImageResizerFilterStep< fptype, fptype > :: Flt;
    	using CImageResizerFilterStep< fptype, fptype > :: FltOrig;
    	using CImageResizerFilterStep< fptype, fptype > :: FltLatency;
    	using CImageResizerFilterStep< fptype, fptype > :: Vars;
    	using CImageResizerFilterStep< fptype, fptype > :: InLen;
    	using CImageResizerFilterStep< fptype, fptype > :: InPrefix;
    	using CImageResizerFilterStep< fptype, fptype > :: InSuffix;
    	using CImageResizerFilterStep< fptype, fptype > :: InElIncr;
    	using CImageResizerFilterStep< fptype, fptype > :: OutLen;
    	using CImageResizerFilterStep< fptype, fptype > :: OutPrefix;
    	using CImageResizerFilterStep< fptype, fptype > :: OutSuffix;
    	using CImageResizerFilterStep< fptype, fptype > :: OutElIncr;
    	using CImageResizerFilterStep< fptype, fptype > :: PrefixDC;
    	using CImageResizerFilterStep< fptype, fptype > :: SuffixDC;
    	using CImageResizerFilterStep< fptype, fptype > :: RPosBuf;
    	using CImageResizerFilterStep< fptype, fptype > :: FltBank;
    	using CImageResizerFilterStep< fptype, fptype > :: EdgePixelCount;
    
    	/**
    	 * Function performs "packing" (de-interleaving) of a scanline and type
    	 * conversion.
    	 *
    	 * @param ip0 Input scanline, pixel elements interleaved.
    	 * @param op0 Output scanline, pixel elements are grouped, "l" elements
    	 * apart.
    	 * @param l The number of pixels to "pack".
    	 */
    
    	template< class Tin >
    	void packScanline( const Tin* const ip0, fptype* const op0,
    		const int l ) const
    	{
    		const int ElCount = Vars -> ElCount;
    		int j;
    
    		for( j = 0; j < ElCount; j++ )
    		{
    			const Tin* ip = ip0 + j;
    			fptype* const op = op0 + j * InElIncr;
    			int i;
    
    			for( i = 0; i < l; i++ )
    			{
    				op[ i ] = (fptype) *ip;
    				ip += ElCount;
    			}
    		}
    	}
    
    	/**
    	 * Function converts vertical scanline to horizontal scanline. This
    	 * function is called by the image resizer when image is resized
    	 * vertically. This means that the vertical scanline is stored in the
    	 * same format produced by the packScanline() and maintained by other
    	 * filtering functions.
    	 *
    	 * @param ip Input vertical scanline, pixel elements are grouped, SrcLen
    	 * elements apart.
    	 * @param op Output buffer (temporary buffer used during resizing), pixel
    	 * elements are grouped, "l" elements apart.
    	 * @param SrcLen The number of pixels in the input scanline, also used to
    	 * calculate input buffer increment.
    	 * @param SrcIncr Input buffer increment to the next vertical pixel.
    	 */
    
    	void convertVtoH( const fptype* ip, fptype* op, const int SrcLen,
    		const int SrcIncr ) const
    	{
    		const int ElCount = Vars -> ElCount;
    		const int SrcElIncr = SrcIncr / ElCount;
    		const int ips1 = SrcElIncr;
    		const int ips2 = SrcElIncr * 2;
    		const int ips3 = SrcElIncr * 3;
    		const int ops1 = InElIncr;
    		const int ops2 = InElIncr * 2;
    		const int ops3 = InElIncr * 3;
    		int j;
    
    		if( ElCount == 1 )
    		{
    			for( j = 0; j < SrcLen; j++ )
    			{
    				op[ 0 ] = ip[ 0 ];
    				ip += SrcIncr;
    				op++;
    			}
    		}
    		else
    		if( ElCount == 4 )
    		{
    			for( j = 0; j < SrcLen; j++ )
    			{
    				op[ 0 ] = ip[ 0 ];
    				op[ ops1 ] = ip[ ips1 ];
    				op[ ops2 ] = ip[ ips2 ];
    				op[ ops3 ] = ip[ ips3 ];
    				ip += SrcIncr;
    				op++;
    			}
    		}
    		else
    		if( ElCount == 3 )
    		{
    			for( j = 0; j < SrcLen; j++ )
    			{
    				op[ 0 ] = ip[ 0 ];
    				op[ ops1 ] = ip[ ips1 ];
    				op[ ops2 ] = ip[ ips2 ];
    				ip += SrcIncr;
    				op++;
    			}
    		}
    		else
    		if( ElCount == 2 )
    		{
    			for( j = 0; j < SrcLen; j++ )
    			{
    				op[ 0 ] = ip[ 0 ];
    				op[ ops1 ] = ip[ ips1 ];
    				ip += SrcIncr;
    				op++;
    			}
    		}
    	}
    
    	/**
    	 * Function performs "unpacking" of a scanline and type conversion
    	 * (truncation is used when floating point is converted to integer).
    	 * The unpacking function assumes that scanline is stored in the style
    	 * produced by the packScanline() function.
    	 *
    	 * @param ip0 Input scanline, pixel elements are grouped, "l" elements
    	 * apart.
    	 * @param op0 Output scanline, pixel elements are interleaved.
    	 * @param l The number of pixels to "unpack".
    	 * @param Vars0 Image resizing-related variables. ElCount is assumed to be
    	 * equal to ElCountIO.
    	 */
    
    	template< class Tout >
    	static void unpackScanline( const fptype* const ip0, Tout* const op0,
    		const int l, const CImageResizerVars& Vars0 )
    	{
    		const int ElCount = Vars0.ElCount;
    		int j;
    
    		for( j = 0; j < ElCount; j++ )
    		{
    			const fptype* const ip = ip0 + j * l;
    			Tout* op = op0 + j;
    			int i;
    
    			for( i = 0; i < l; i++ )
    			{
    				*op = (Tout) ip[ i ];
    				op += ElCount;
    			}
    		}
    	}
    
    	/**
    	 * Function prepares input scanline buffer for *this filtering step.
    	 * Left- and right-most pixels are replicated to make sure no buffer
    	 * overrun happens. Such approach also allows to bypass any pointer
    	 * range checks.
    	 *
    	 * @param Src Source buffer.
    	 */
    
    	void prepareInBuf( fptype* Src ) const
    	{
    		// Padding is only produced for non-upsampling steps that request
    		// a prefix and/or a suffix.
    		if( !IsUpsample && InPrefix + InSuffix != 0 )
    		{
    			const int ChanCount = Vars -> ElCount;
    			fptype* plane = Src;
    
    			for( int ch = 0; ch < ChanCount; ch++ )
    			{
    				// Last valid element of the current plane.
    				fptype* const last = plane + InLen - 1;
    
    				// replicateArray() is declared elsewhere; from its use here
    				// it presumably copies the single source element into
    				// InPrefix slots before the plane and InSuffix slots after
    				// it - confirm against its declaration.
    				replicateArray( plane, 1, plane - InPrefix, InPrefix, 1 );
    				replicateArray( last, 1, last + 1, InSuffix, 1 );
    
    				// Planes are InElIncr elements apart.
    				plane += InElIncr;
    			}
    		}
    	}
    
    	/**
    	 * Function performs scanline upsampling with filtering.
    	 *
    	 * @param Src Source scanline buffer (length = this -> InLen). Elements of
    	 * each pixel element plane are read consecutively (increment of 1),
    	 * planes are InElIncr elements apart.
    	 * @param Dst Destination scanline buffer.
    	 */
    
    	void doUpsample( const fptype* Src, fptype* Dst ) const
    	{
    		const int elalign = Vars -> elalign;
    		const int opstep = ResampleFactor;
    		const fptype* const f = Flt;
    		const int flen = Flt.getCapacity();
    		int l;
    		int i;
    		int j;
    
    		for( j = 0; j < Vars -> ElCount; j++ )
    		{
    			const fptype* ip = Src;
    			fptype* op0 = &Dst[ -OutPrefix ];
    			memset( op0, 0, ( OutPrefix + OutLen + OutSuffix ) *
    				sizeof( fptype ));
    
    			if( FltOrig.getCapacity() > 0 )
    			{
    				// Do not perform filtering, only upsample.
    
    				op0 += OutPrefix % ResampleFactor;
    				l = OutPrefix / ResampleFactor;
    
    				while( l > 0 )
    				{
    					op0[ 0 ] = ip[ 0 ];
    					op0 += opstep;
    					l--;
    				}
    
    				l = InLen - 1;
    
    				while( l > 0 )
    				{
    					op0[ 0 ] = ip[ 0 ];
    					op0 += opstep;
    					ip++;
    					l--;
    				}
    
    				l = OutSuffix / ResampleFactor;
    
    				while( l >= 0 )
    				{
    					op0[ 0 ] = ip[ 0 ];
    					op0 += opstep;
    					l--;
    				}
    
    				Src += InElIncr;
    				Dst += OutElIncr;
    				continue;
    			}
    
    			l = InPrefix;
    			fptypesimd ipv = (fptypesimd) ip[ 0 ];
    
    			while( l > 0 )
    			{
    				for( i = 0; i < flen; i += elalign )
    				{
    					fptypesimd :: addu( op0 + i,
    						fptypesimd :: load( f + i ) * ipv );
    				}
    
    				op0 += opstep;
    				l--;
    			}
    
    			l = InLen - 1;
    
    			while( l > 0 )
    			{
    				ipv = (fptypesimd) ip[ 0 ];
    
    				for( i = 0; i < flen; i += elalign )
    				{
    					fptypesimd :: addu( op0 + i,
    						fptypesimd :: load( f + i ) * ipv );
    				}
    
    				ip++;
    				op0 += opstep;
    				l--;
    			}
    
    			l = InSuffix;
    			ipv = (fptypesimd) ip[ 0 ];
    
    			while( l >= 0 )
    			{
    				for( i = 0; i < flen; i += elalign )
    				{
    					fptypesimd :: addu( op0 + i,
    						fptypesimd :: load( f + i ) * ipv );
    				}
    
    				op0 += opstep;
    				l--;
    			}
    
    			const fptype* dc = SuffixDC;
    			l = SuffixDC.getCapacity();
    
    			for( i = 0; i < l; i += elalign )
    			{
    				fptypesimd :: addu( op0 + i,
    					fptypesimd :: load( dc + i ) * ipv );
    			}
    
    			ipv = (fptypesimd) Src[ 0 ];
    			op0 = Dst - InPrefix * opstep;
    			dc = PrefixDC;
    			l = PrefixDC.getCapacity();
    
    			for( i = 0; i < l; i += elalign )
    			{
    				fptypesimd :: addu( op0 + i,
    					fptypesimd :: load( dc + i ) * ipv );
    			}
    
    			Src += InElIncr;
    			Dst += OutElIncr;
    		}
    	}
    
    	/**
    	 * Function performs scanline filtering with optional downsampling.
    	 * Every filter tap is applied explicitly: a complete dot product is
    	 * evaluated for each output value.
    	 *
    	 * @param Src Source scanline buffer (length = this -> InLen). Source
    	 * scanline increment will be equal to 1.
    	 * @param Dst Destination scanline buffer.
    	 * @param DstIncr Destination scanline buffer increment, used for
    	 * horizontal or vertical scanline stepping.
    	 */
    
    	void doFilter( const fptype* const Src, fptype* Dst,
    		const int DstIncr ) const
    	{
    		const int ElCount = Vars -> ElCount;
    		const int elalign = Vars -> elalign;
    		const fptype* const f = &Flt[ 0 ];
    		const int flen = Flt.getCapacity();
    		const int ipstep = ResampleFactor;
    		int i;
    		int j;
    
    		if( ElCount == 1 )
    		{
    			const fptype* ip = Src - EdgePixelCount * ipstep - FltLatency;
    			fptype* op = Dst;
    			int l = OutLen;
    
    			while( l > 0 )
    			{
    				fptypesimd s = fptypesimd :: load( f ) *
    					fptypesimd :: loadu( ip );
    
    				for( i = elalign; i < flen; i += elalign )
    				{
    					s += fptypesimd :: load( f + i ) *
    						fptypesimd :: loadu( ip + i );
    				}
    
    				op[ 0 ] = s.hadd();
    				op += DstIncr;
    				ip += ipstep;
    				l--;
    			}
    		}
    		else
    		if( DstIncr == 1 )
    		{
    			for( j = 0; j < ElCount; j++ )
    			{
    				const fptype* ip = Src - EdgePixelCount * ipstep -
    					FltLatency + j * InElIncr;
    
    				fptype* op = Dst + j * OutElIncr;
    				int l = OutLen;
    
    				while( l > 0 )
    				{
    					fptypesimd s = fptypesimd :: load( f ) *
    						fptypesimd :: loadu( ip );
    
    					for( i = elalign; i < flen; i += elalign )
    					{
    						s += fptypesimd :: load( f + i ) *
    							fptypesimd :: loadu( ip + i );
    					}
    
    					op[ 0 ] = s.hadd();
    					op += DstIncr;
    					ip += ipstep;
    					l--;
    				}
    			}
    		}
    		else
    		{
    			const fptype* ip0 = Src - EdgePixelCount * ipstep - FltLatency;
    			fptype* op0 = Dst;
    			int l = OutLen;
    
    			while( l > 0 )
    			{
    				const fptype* ip = ip0;
    				fptype* op = op0;
    
    				for( j = 0; j < ElCount; j++ )
    				{
    					fptypesimd s = fptypesimd :: load( f ) *
    						fptypesimd :: loadu( ip );
    
    					for( i = elalign; i < flen; i += elalign )
    					{
    						s += fptypesimd :: load( f + i ) *
    							fptypesimd :: loadu( ip + i );
    					}
    
    					op[ 0 ] = s.hadd();
    					ip += InElIncr;
    					op += OutElIncr;
    				}
    
    				ip0 += ipstep;
    				op0 += DstIncr;
    				l--;
    			}
    		}
    	}
    
    	/**
    	 * Function performs resizing of a single scanline. This function does
    	 * not "know" about the length of the source scanline buffer. This buffer
    	 * should be padded with enough pixels so that ( SrcPos - FilterLenD2 ) is
    	 * always >= 0 and ( SrcPos + ( DstLineLen - 1 ) * k + FilterLenD2 + 1 )
    	 * does not exceed source scanline's buffer length. SrcLine's increment is
    	 * assumed to be equal to 1.
    	 *
    	 * @param SrcLine Source scanline buffer.
    	 * @param DstLine Destination (resized) scanline buffer.
    	 * @param DstLineIncr Destination scanline position increment, used for
    	 * horizontal or vertical scanline stepping.
    	 */
    
    	void doResize( const fptype* SrcLine, fptype* DstLine,
    		int DstLineIncr ) const
    	{
    		const int IntFltLen = FltBank -> getFilterLen();
    		const int ElCount = Vars -> ElCount;
    		const int elalign = Vars -> elalign;
    		const typename CImageResizerFilterStep< fptype, fptype > ::
    			CResizePos* rpos = &(*RPosBuf)[ 0 ];
    
    		int DstLineLen = OutLen;
    		int i;
    		int j;
    
    		// The three macros below open and close the per-output-position
    		// loop shared by all branches. PART1 fetches, from the current
    		// resizing position, the fraction "x", the filter "ftp", the second
    		// coefficient set "ftp2" (located IntFltLen elements after "ftp")
    		// and the source window "Src". PART1nx is the variant that does not
    		// use "x" and "ftp2". PART2 advances to the next output position.
    		//
    		// When the filter bank's order is 1, each effective coefficient is
    		// ftp[ i ] + ftp2[ i ] * x. These coefficients are always computed
    		// inline, in SIMD registers: an intermediate runtime-sized stack
    		// array is not standard C++, and it had to be written in
    		// elalign-wide chunks. The price is that the vertical-oriented
    		// branch recomputes the coefficients for every element plane.
    		// IntFltLen is assumed to be a multiple of elalign.
    
    #define AVIR_RESIZE_PART1 \
    			while( DstLineLen > 0 ) \
    			{ \
    				const fptypesimd x = (fptypesimd) rpos -> x; \
    				const fptype* ftp = rpos -> ftp; \
    				const fptype* ftp2 = rpos -> ftp + IntFltLen; \
    				const fptype* Src = SrcLine + rpos -> SrcOffs;
    
    #define AVIR_RESIZE_PART1nx \
    			while( DstLineLen > 0 ) \
    			{ \
    				const fptype* ftp = rpos -> ftp; \
    				const fptype* Src = SrcLine + rpos -> SrcOffs;
    
    #define AVIR_RESIZE_PART2 \
    				DstLine += DstLineIncr; \
    				rpos++; \
    				DstLineLen--; \
    			}
    
    		if( ElCount == 1 )
    		{
    			// Single element plane.
    
    			if( FltBank -> getOrder() == 1 )
    			{
    				AVIR_RESIZE_PART1
    
    				fptypesimd sum = ( fptypesimd :: load( ftp ) +
    					fptypesimd :: load( ftp2 ) * x ) *
    					fptypesimd :: loadu( Src );
    
    				for( i = elalign; i < IntFltLen; i += elalign )
    				{
    					sum += ( fptypesimd :: load( ftp + i ) +
    						fptypesimd :: load( ftp2 + i ) * x ) *
    						fptypesimd :: loadu( Src + i );
    				}
    
    				DstLine[ 0 ] = sum.hadd();
    
    				AVIR_RESIZE_PART2
    			}
    			else
    			{
    				AVIR_RESIZE_PART1nx
    
    				fptypesimd sum = fptypesimd :: load( ftp ) *
    					fptypesimd :: loadu( Src );
    
    				for( i = elalign; i < IntFltLen; i += elalign )
    				{
    					sum += fptypesimd :: load( ftp + i ) *
    						fptypesimd :: loadu( Src + i );
    				}
    
    				DstLine[ 0 ] = sum.hadd();
    
    				AVIR_RESIZE_PART2
    			}
    		}
    		else
    		if( DstLineIncr == 1 )
    		{
    			// Horizontal-oriented processing, element loop is outer.
    
    			const int SrcIncr = InElIncr;
    
    			// After a plane was written DstLine has advanced by
    			// DstLineIncr * OutLen; this increment takes it to the start of
    			// the next output plane.
    			const int DstLineElIncr = OutElIncr - DstLineIncr * DstLineLen;
    
    			if( FltBank -> getOrder() == 1 )
    			{
    				for( j = 0; j < ElCount; j++ )
    				{
    					AVIR_RESIZE_PART1
    
    					fptypesimd sum = ( fptypesimd :: load( ftp ) +
    						fptypesimd :: load( ftp2 ) * x ) *
    						fptypesimd :: loadu( Src );
    
    					for( i = elalign; i < IntFltLen; i += elalign )
    					{
    						sum += ( fptypesimd :: load( ftp + i ) +
    							fptypesimd :: load( ftp2 + i ) * x ) *
    							fptypesimd :: loadu( Src + i );
    					}
    
    					DstLine[ 0 ] = sum.hadd();
    
    					AVIR_RESIZE_PART2
    
    					// Move to the next plane and restart the position list.
    					DstLine += DstLineElIncr;
    					SrcLine += SrcIncr;
    					DstLineLen = OutLen;
    					rpos = &(*RPosBuf)[ 0 ];
    				}
    			}
    			else
    			{
    				for( j = 0; j < ElCount; j++ )
    				{
    					AVIR_RESIZE_PART1nx
    
    					fptypesimd sum = fptypesimd :: load( ftp ) *
    						fptypesimd :: loadu( Src );
    
    					for( i = elalign; i < IntFltLen; i += elalign )
    					{
    						sum += fptypesimd :: load( ftp + i ) *
    							fptypesimd :: loadu( Src + i );
    					}
    
    					DstLine[ 0 ] = sum.hadd();
    
    					AVIR_RESIZE_PART2
    
    					// Move to the next plane and restart the position list.
    					DstLine += DstLineElIncr;
    					SrcLine += SrcIncr;
    					DstLineLen = OutLen;
    					rpos = &(*RPosBuf)[ 0 ];
    				}
    			}
    		}
    		else
    		{
    			// Vertical-oriented processing, element loop is inner: all
    			// element planes of an output position are written before
    			// stepping to the next position.
    
    			const int SrcIncr = InElIncr;
    			const int DstLineElIncr = OutElIncr;
    
    			// The inner loop advances DstLine by DstLineElIncr * ElCount,
    			// which is compensated for in the per-position increment.
    			DstLineIncr -= DstLineElIncr * ElCount;
    
    			if( FltBank -> getOrder() == 1 )
    			{
    				AVIR_RESIZE_PART1
    
    				for( j = 0; j < ElCount; j++ )
    				{
    					fptypesimd sum = ( fptypesimd :: load( ftp ) +
    						fptypesimd :: load( ftp2 ) * x ) *
    						fptypesimd :: loadu( Src );
    
    					for( i = elalign; i < IntFltLen; i += elalign )
    					{
    						sum += ( fptypesimd :: load( ftp + i ) +
    							fptypesimd :: load( ftp2 + i ) * x ) *
    							fptypesimd :: loadu( Src + i );
    					}
    
    					DstLine[ 0 ] = sum.hadd();
    					DstLine += DstLineElIncr;
    					Src += SrcIncr;
    				}
    
    				AVIR_RESIZE_PART2
    			}
    			else
    			{
    				AVIR_RESIZE_PART1nx
    
    				for( j = 0; j < ElCount; j++ )
    				{
    					fptypesimd sum = fptypesimd :: load( ftp ) *
    						fptypesimd :: loadu( Src );
    
    					for( i = elalign; i < IntFltLen; i += elalign )
    					{
    						sum += fptypesimd :: load( ftp + i ) *
    							fptypesimd :: loadu( Src + i );
    					}
    
    					DstLine[ 0 ] = sum.hadd();
    					DstLine += DstLineElIncr;
    					Src += SrcIncr;
    				}
    
    				AVIR_RESIZE_PART2
    			}
    		}
    
    #undef AVIR_RESIZE_PART2
    #undef AVIR_RESIZE_PART1nx
    #undef AVIR_RESIZE_PART1
    	}
    };
    
    /**
     * @brief Image resizer's default de-interleaved dithering class.
     *
     * This class defines an object that performs rounding, clipping and dithering
     * operations over horizontal scanline pixels before scanline is stored in the
     * output buffer.
     *
     * This ditherer implementation uses de-interleaved SIMD algorithm.
     *
     * @tparam fptype Floating point type to use for storing pixel data. SIMD
     * types cannot be used.
     * @tparam fptypesimd The SIMD type used to store a pack of "fptype" values.
     */
    
    template< class fptype, class fptypesimd >
    class CImageResizerDithererDefDIL
    {
    public:
    	/**
    	 * Function initializes the ditherer object.
    	 *
    	 * @param aLen Scanline length in pixels to process.
    	 * @param aVars Image resizing-related variables.
    	 * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
    	 * truncation.
    	 * @param aPkOut Peak output value allowed.
    	 */
    
    	void init( const int aLen, const CImageResizerVars& aVars,
    		const double aTrMul, const double aPkOut )
    	{
    		Len = aLen;
    		Vars = &aVars;
    		LenE = aLen * Vars -> ElCount;
    		TrMul0 = aTrMul;
    		PkOut0 = aPkOut;
    	}
    
    	/**
    	 * @return "True" if dithering is recursive relative to scanlines meaning
    	 * multi-threaded execution is not supported by this dithering method.
    	 */
    
    	static bool isRecursive()
    	{
    		return( false );
    	}
    
    	/**
    	 * Function performs rounding and clipping operations.
    	 *
    	 * @param ResScanline The buffer containing the final scanline.
    	 */
    
    	void dither( fptype* const ResScanline ) const
    	{
    		const int elalign = Vars -> elalign;
    		const fptypesimd c0 = 0.0;
    		const fptypesimd TrMul = (fptypesimd) TrMul0;
    		const fptypesimd PkOut = (fptypesimd) PkOut0;
    		int j;
    
    		for( j = 0; j < LenE - elalign; j += elalign )
    		{
    			const fptypesimd z0 = round(
    				fptypesimd :: loadu( ResScanline + j ) / TrMul ) * TrMul;
    
    			clamp( z0, c0, PkOut ).storeu( ResScanline + j );
    		}
    
    		const int lim = LenE - j;
    		const fptypesimd z0 = round(
    			fptypesimd :: loadu( ResScanline + j, lim ) / TrMul ) * TrMul;
    
    		clamp( z0, c0, PkOut ).storeu( ResScanline + j, lim );
    	}
    
    protected:
    	int Len; ///< Scanline's length in pixels.
    		///<
    	const CImageResizerVars* Vars; ///< Image resizing-related variables.
    		///<
    	int LenE; ///< = LenE * ElCount.
    		///<
    	double TrMul0; ///< Bit-depth truncation multiplier.
    		///<
    	double PkOut0; ///< Peak output value allowed.
    		///<
    };
    
    /**
     * @brief Image resizer's quasi-random dithering class, de-interleaved mode.
     *
     * This ditherer implements a classic quasi-random dithering which looks OK
     * and whose results are compressed by PNG well.
     *
     * @tparam fptype Floating point type to use for storing pixel data. SIMD
     * types cannot be used.
     * @tparam fptypesimd The SIMD type used to store a pack of "fptype" values.
     */
    
    template< class fptype, class fptypesimd >
    class CImageResizerDithererQRndDIL
    {
    public:
    	/**
    	 * Function initializes the ditherer object and allocates a zeroed error
    	 * propagation buffer of ( aLen + 1 ) * ElCount elements.
    	 *
    	 * @param aLen Scanline length in pixels to process.
    	 * @param aVars Image resizing-related variables. Only the address of
    	 * this object is stored, so it has to stay valid while the ditherer is
    	 * in use.
    	 * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
    	 * truncation.
    	 * @param aPkOut Peak output value allowed.
    	 */
    
    	void init( const int aLen, const CImageResizerVars& aVars,
    		const double aTrMul, const double aPkOut )
    	{
    		Len = aLen;
    		Vars = &aVars;
    		LenE = aLen * Vars -> ElCount;
    		TrMul0 = aTrMul;
    		PkOut0 = aPkOut;
    
    		// ElCount additional leading elements are reserved so that the
    		// "[ -1 ]" write of the first pixel in dither() stays inside the
    		// buffer.
    
    		ResScanlineDith0.alloc( LenE + Vars -> ElCount, sizeof( fptype ));
    		ResScanlineDith = ResScanlineDith0 + Vars -> ElCount;
    		int i;
    
    		for( i = 0; i < LenE + Vars -> ElCount; i++ )
    		{
    			ResScanlineDith0[ i ] = 0.0;
    		}
    	}
    
    	/**
    	 * @return "True" if dithering is recursive relative to scanlines meaning
    	 * multi-threaded execution is not supported by this dithering method.
    	 * This ditherer always returns "true": dither() carries error from one
    	 * scanline to the next via the ResScanlineDith buffer.
    	 */
    
    	static bool isRecursive()
    	{
    		return( true );
    	}
    
    	/**
    	 * Function performs rounding, clipping and error-diffusion dithering,
    	 * in-place.
    	 *
    	 * @param ResScanline The buffer containing the final scanline, element
    	 * planes are "Len" elements apart.
    	 */
    
    	void dither( fptype* const ResScanline )
    	{
    		if( Len < 1 )
    		{
    			// Nothing to process; also keeps the "rs[ Len - 1 ]" access
    			// below inside the buffer.
    
    			return;
    		}
    
    		const int ea = Vars -> elalign;
    		const fptypesimd c0v = 0.0;
    		int j;
    
    		// SIMD pass: add the error accumulated while processing the
    		// previous scanline, then clear the error buffer so that it can
    		// collect the error meant for the next scanline.
    
    		for( j = 0; j < LenE - ea; j += ea )
    		{
    			fptypesimd :: addu( ResScanline + j,
    				fptypesimd :: loadu( ResScanlineDith + j ));
    
    			c0v.storeu( ResScanlineDith + j );
    		}
    
    		int lim = LenE - j;
    		fptypesimd :: addu( ResScanline + j,
    			fptypesimd :: loadu( ResScanlineDith + j, lim ), lim );
    
    		c0v.storeu( ResScanlineDith + j, lim );
    
    		// Scalar pass. The constants are kept in the scalar "fptype" here:
    		// the loop below works on single elements, so SIMD-typed constants
    		// would have to be converted implicitly on every use.
    
    		const fptype c0 = (fptype) 0.0;
    		const fptype TrMul = (fptype) TrMul0;
    		const fptype PkOut = (fptype) PkOut0;
    		const int Len1 = Len - 1;
    		fptype* rs = ResScanline;
    		fptype* rsd = ResScanlineDith;
    		int i;
    
    		for( i = 0; i < Vars -> ElCount; i++ )
    		{
    			for( j = 0; j < Len1; j++ )
    			{
    				// Perform rounding, noise estimation and saturation.
    
    				fptype* const rsj = rs + j;
    				const fptype z0 = round( rsj[ 0 ] / TrMul ) * TrMul;
    				const fptype Noise = rsj[ 0 ] - z0;
    				rsj[ 0 ] = clamp( z0, c0, PkOut );
    
    				// Distribute the rounding error: 7/16 to the next element
    				// of this scanline, 1/16, 5/16 and 3/16 to the next, same
    				// and previous positions of the following scanline (the
    				// Floyd-Steinberg weights). NOTE(review): for planes after
    				// the first, "rsdj[ -1 ]" at j == 0 addresses the last
    				// element of the preceding plane.
    
    				fptype* const rsdj = rsd + j;
    				rsj[ 1 ] += Noise * (fptype) 0.4375;
    				rsdj[ 1 ] += Noise * (fptype) 0.0625;
    				rsdj[ 0 ] += Noise * (fptype) 0.3125;
    				rsdj[ -1 ] += Noise * (fptype) 0.1875;
    			}
    
    			// Process the last pixel element in scanline: it has no "next"
    			// element, so only the 5/16 and 3/16 parts are propagated.
    
    			const fptype z1 = round( rs[ Len1 ] / TrMul ) * TrMul;
    			const fptype Noise2 = rs[ Len1 ] - z1;
    			rs[ Len1 ] = clamp( z1, c0, PkOut );
    
    			rsd[ Len1 ] += Noise2 * (fptype) 0.3125;
    			rsd[ Len1 - 1 ] += Noise2 * (fptype) 0.1875;
    
    			rs += Len;
    			rsd += Len;
    		}
    	}
    
    protected:
    	int Len; ///< Scanline's length in pixels.
    		///<
    	const CImageResizerVars* Vars; ///< Image resizing-related variables.
    		///<
    	int LenE; ///< = Len * ElCount.
    		///<
    	double TrMul0; ///< Bit-depth truncation multiplier.
    		///<
    	double PkOut0; ///< Peak output value allowed.
    		///<
    	CBuffer< fptype > ResScanlineDith0; ///< Error propagation buffer for
    		///< dithering, first ElCount elements serve as a guard area.
    		///<
    	fptype* ResScanlineDith; ///< Error propagation buffer pointer which skips
    		///< the first ElCount elements.
    		///<
    };
    
    /**
     * @brief Floating-point processing definition and abstraction class for
     * de-interleaved processing.
     *
     * This class defines several constants and typedefs that point to classes
     * that should be used by the image resizing algorithm. This implementation
     * points to de-interleaved processing classes.
     *
     * @tparam afptype Floating point type to use for storing intermediate data
     * and variables. SIMD types should not be used.
     * @tparam afptypesimd SIMD type used to perform processing.
     */
    
    template< class afptype, class afptypesimd >
    class fpclass_def_dil
    {
    public:
    	typedef afptype fptype; ///< Floating-point type used for processing.
    		///<
    	typedef afptype fptypeatom; ///< Atomic type "fptype" is built from;
    		///< here it is the same type as "fptype".
    		///<
    	static const int fppack = 1; ///< The number of atomic types stored in a
    		///< single "fptype" element.
    		///<
    	static const int fpalign = sizeof( afptypesimd ); ///< Suggested alignment
    		///< size in bytes (the size of the SIMD type). This is not a
    		///< required alignment, because image resizing algorithm cannot be
    		///< made to have a strictly aligned data access in all cases (e.g.
    		///< de-interleaved interpolation cannot perform aligned accesses).
    		///<
    	static const int elalign = fpalign / (int) sizeof( afptype ); ///< Length
    		///< alignment of arrays of elements: the number of "afptype" values
    		///< that fit into one "afptypesimd". This applies to filters and
    		///< intermediate buffers: this constant forces filters and
    		///< scanlines to have a length which is a multiple of this value,
    		///< for more efficient SIMD implementation. Value different to 1
    		///< also means image pixels are de-interleaved during processing.
    		///<
    	typedef CImageResizerFilterStepDIL< afptype, afptypesimd >
    		CFilterStep; ///< Filtering step class to use during processing.
    		///<
    	typedef CImageResizerDithererQRndDIL< afptype, afptypesimd >
    		CDitherer; ///< Ditherer class to use during processing.
    		///<
    	typedef fpclass_reset< afptypesimd > CReset; ///< Floating-point
    		///< processing reset implementation class.
    		///<
    };
    
    } // namespace avir