OB-Xd/Modules/gin/3rdparty/avir/avir_dil.h

//$ nobt
//$ nocpp

/**
 * @file avir_dil.h
 *
 * @brief Inclusion file for de-interleaved image resizing functions.
 *
 * This file includes the "CImageResizerFilterStepDIL" class which implements
 * image resizing functions in de-interleaved mode.
 *
 * AVIR Copyright (c) 2015-2019 Aleksey Vaneev
 */

namespace avir {

/**
 * @brief De-interleaved filtering steps implementation class.
 *
 * This class implements scanline filtering functions in de-interleaved mode.
 * This means that pixels are processed in groups.
 *
 * @tparam fptype Floating point type to use for storing pixel elements.
 * SIMD types cannot be used.
 * @tparam fptypesimd The SIMD type used to store a pack of "fptype" values.
 */

template< class fptype, class fptypesimd >
class CImageResizerFilterStepDIL :
    public CImageResizerFilterStep< fptype, fptype >
{
public:
    using CImageResizerFilterStep< fptype, fptype > :: IsUpsample;
    using CImageResizerFilterStep< fptype, fptype > :: ResampleFactor;
    using CImageResizerFilterStep< fptype, fptype > :: Flt;
    using CImageResizerFilterStep< fptype, fptype > :: FltOrig;
    using CImageResizerFilterStep< fptype, fptype > :: FltLatency;
    using CImageResizerFilterStep< fptype, fptype > :: Vars;
    using CImageResizerFilterStep< fptype, fptype > :: InLen;
    using CImageResizerFilterStep< fptype, fptype > :: InPrefix;
    using CImageResizerFilterStep< fptype, fptype > :: InSuffix;
    using CImageResizerFilterStep< fptype, fptype > :: InElIncr;
    using CImageResizerFilterStep< fptype, fptype > :: OutLen;
    using CImageResizerFilterStep< fptype, fptype > :: OutPrefix;
    using CImageResizerFilterStep< fptype, fptype > :: OutSuffix;
    using CImageResizerFilterStep< fptype, fptype > :: OutElIncr;
    using CImageResizerFilterStep< fptype, fptype > :: PrefixDC;
    using CImageResizerFilterStep< fptype, fptype > :: SuffixDC;
    using CImageResizerFilterStep< fptype, fptype > :: RPosBuf;
    using CImageResizerFilterStep< fptype, fptype > :: FltBank;
    using CImageResizerFilterStep< fptype, fptype > :: EdgePixelCount;

    /**
     * Function performs "packing" (de-interleaving) of a scanline and type
     * conversion. If required, the sRGB gamma correction is applied.
     *
     * @param ip0 Input scanline, pixel elements interleaved.
     * @param op0 Output scanline, pixel elements are grouped, "l" elements
     * apart.
     * @param l The number of pixels to "pack".
     */

    template< class Tin >
    void packScanline( const Tin* const ip0, fptype* const op0,
        const int l ) const
    {
        const int ElCount = Vars -> ElCount;
        int j;

        if( !Vars -> UseSRGBGamma )
        {
            for( j = 0; j < ElCount; j++ )
            {
                const Tin* ip = ip0 + j;
                fptype* const op = op0 + j * InElIncr;
                int i;

                for( i = 0; i < l; i++ )
                {
                    op[ i ] = (fptype) *ip;
                    ip += ElCount;
                }
            }
        }
        else
        {
            const fptype gm = (fptype) Vars -> InGammaMult;

            for( j = 0; j < ElCount; j++ )
            {
                const Tin* ip = ip0 + j;
                fptype* const op = op0 + j * InElIncr;
                int i;

                for( i = 0; i < l; i++ )
                {
                    op[ i ] = convertSRGB2Lin( (fptype) *ip * gm );
                    ip += ElCount;
                }
            }
        }
    }

    /**
     * Function applies Linear to sRGB gamma correction to the specified
     * scanline.
     *
     * @param p Scanline.
     * @param l The number of pixels to de-linearize.
     * @param Vars0 Image resizing-related variables.
     */

    static void applySRGBGamma( fptype* const p0, const int l,
        const CImageResizerVars& Vars0 )
    {
        const int ElCount = Vars0.ElCount;
        const fptype gm = (fptype) Vars0.OutGammaMult;
        int j;

        for( j = 0; j < ElCount; j++ )
        {
            fptype* const p = p0 + j * l;
            int i;

            for( i = 0; i < l; i++ )
            {
                p[ i ] = convertLin2SRGB( p[ i ]) * gm;
            }
        }
    }

    /**
     * Function converts vertical scanline to horizontal scanline. This
     * function is called by the image resizer when image is resized
     * vertically. This means that the vertical scanline is stored in the
     * same format produced by the packScanline() and maintained by other
     * filtering functions.
     *
     * @param ip Input vertical scanline, pixel elements are grouped, SrcLen
     * elements apart.
     * @param op Output buffer (temporary buffer used during resizing), pixel
     * elements are grouped, "l" elements apart.
     * @param SrcLen The number of pixels in the input scanline, also used to
     * calculate input buffer increment.
     * @param SrcIncr Input buffer increment to the next vertical pixel.
     */

    void convertVtoH( const fptype* ip, fptype* op, const int SrcLen,
        const int SrcIncr ) const
    {
        const int ElCount = Vars -> ElCount;
        const int SrcElIncr = SrcIncr / ElCount;
        const int ips1 = SrcElIncr;
        const int ips2 = SrcElIncr * 2;
        const int ips3 = SrcElIncr * 3;
        const int ops1 = InElIncr;
        const int ops2 = InElIncr * 2;
        const int ops3 = InElIncr * 3;
        int j;

        if( ElCount == 1 )
        {
            for( j = 0; j < SrcLen; j++ )
            {
                op[ 0 ] = ip[ 0 ];
                ip += SrcIncr;
                op++;
            }
        }
        else
        if( ElCount == 4 )
        {
            for( j = 0; j < SrcLen; j++ )
            {
                op[ 0 ] = ip[ 0 ];
                op[ ops1 ] = ip[ ips1 ];
                op[ ops2 ] = ip[ ips2 ];
                op[ ops3 ] = ip[ ips3 ];
                ip += SrcIncr;
                op++;
            }
        }
        else
        if( ElCount == 3 )
        {
            for( j = 0; j < SrcLen; j++ )
            {
                op[ 0 ] = ip[ 0 ];
                op[ ops1 ] = ip[ ips1 ];
                op[ ops2 ] = ip[ ips2 ];
                ip += SrcIncr;
                op++;
            }
        }
        else
        if( ElCount == 2 )
        {
            for( j = 0; j < SrcLen; j++ )
            {
                op[ 0 ] = ip[ 0 ];
                op[ ops1 ] = ip[ ips1 ];
                ip += SrcIncr;
                op++;
            }
        }
    }

    /**
     * Function performs "unpacking" of a scanline and type conversion
     * (truncation is used when floating point is converted to integer).
     * The unpacking function assumes that scanline is stored in the style
     * produced by the packScanline() function.
     *
     * @param ip0 Input scanline, pixel elements are grouped, "l" elements
     * apart.
     * @param op0 Output scanline, pixel elements are interleaved.
     * @param l The number of pixels to "unpack".
     * @param Vars0 Image resizing-related variables. ElCount is assumed to be
     * equal to ElCountIO.
     */

    template< class Tout >
    static void unpackScanline( const fptype* const ip0, Tout* const op0,
        const int l, const CImageResizerVars& Vars0 )
    {
        const int ElCount = Vars0.ElCount;
        int j;

        for( j = 0; j < ElCount; j++ )
        {
            const fptype* const ip = ip0 + j * l;
            Tout* op = op0 + j;
            int i;

            for( i = 0; i < l; i++ )
            {
                *op = (Tout) ip[ i ];
                op += ElCount;
            }
        }
    }

    /**
     * Function prepares input scanline buffer for *this filtering step.
     * Left- and right-most pixels are replicated to make sure no buffer
     * overrun happens. Such approach also allows to bypass any pointer
     * range checks.
     *
     * @param Src Source buffer.
     */

    void prepareInBuf( fptype* Src ) const
    {
        if( IsUpsample || InPrefix + InSuffix == 0 )
        {
            return;
        }

        int j;

        for( j = 0; j < Vars -> ElCount; j++ )
        {
            replicateArray( Src, 1, Src - InPrefix, InPrefix, 1 );
            fptype* const Src2 = Src + InLen - 1;
            replicateArray( Src2, 1, Src2 + 1, InSuffix, 1 );
            Src += InElIncr;
        }
    }

    /**
     * Function peforms scanline upsampling with filtering.
     *
     * @param Src Source scanline buffer (length = this -> InLen). Source
     * scanline increment will be equal to ElCount.
     * @param Dst Destination scanline buffer.
     */

    void doUpsample( const fptype* Src, fptype* Dst ) const
    {
        const int elalign = Vars -> elalign;
        const int opstep = ResampleFactor;
        const fptype* const f = Flt;
        const int flen = Flt.getCapacity();
        int l;
        int i;
        int j;

        for( j = 0; j < Vars -> ElCount; j++ )
        {
            const fptype* ip = Src;
            fptype* op0 = &Dst[ -OutPrefix ];
            memset( op0, 0, ( OutPrefix + OutLen + OutSuffix ) *
                sizeof( fptype ));

            if( FltOrig.getCapacity() > 0 )
            {
                // Do not perform filtering, only upsample.

                op0 += OutPrefix % ResampleFactor;
                l = OutPrefix / ResampleFactor;

                while( l > 0 )
                {
                    op0[ 0 ] = ip[ 0 ];
                    op0 += opstep;
                    l--;
                }

                l = InLen - 1;

                while( l > 0 )
                {
                    op0[ 0 ] = ip[ 0 ];
                    op0 += opstep;
                    ip++;
                    l--;
                }

                l = OutSuffix / ResampleFactor;

                while( l >= 0 )
                {
                    op0[ 0 ] = ip[ 0 ];
                    op0 += opstep;
                    l--;
                }

                Src += InElIncr;
                Dst += OutElIncr;
                continue;
            }

            l = InPrefix;
            fptypesimd ipv = (fptypesimd) ip[ 0 ];

            while( l > 0 )
            {
                for( i = 0; i < flen; i += elalign )
                {
                    fptypesimd :: addu( op0 + i,
                        fptypesimd :: load( f + i ) * ipv );
                }

                op0 += opstep;
                l--;
            }

            l = InLen - 1;

            while( l > 0 )
            {
                ipv = (fptypesimd) ip[ 0 ];

                for( i = 0; i < flen; i += elalign )
                {
                    fptypesimd :: addu( op0 + i,
                        fptypesimd :: load( f + i ) * ipv );
                }

                ip++;
                op0 += opstep;
                l--;
            }

            l = InSuffix;
            ipv = (fptypesimd) ip[ 0 ];

            while( l >= 0 )
            {
                for( i = 0; i < flen; i += elalign )
                {
                    fptypesimd :: addu( op0 + i,
                        fptypesimd :: load( f + i ) * ipv );
                }

                op0 += opstep;
                l--;
            }

            const fptype* dc = SuffixDC;
            l = SuffixDC.getCapacity();

            for( i = 0; i < l; i += elalign )
            {
                fptypesimd :: addu( op0 + i,
                    fptypesimd :: load( dc + i ) * ipv );
            }

            ipv = (fptypesimd) Src[ 0 ];
            op0 = Dst - InPrefix * opstep;
            dc = PrefixDC;
            l = PrefixDC.getCapacity();

            for( i = 0; i < l; i += elalign )
            {
                fptypesimd :: addu( op0 + i,
                    fptypesimd :: load( dc + i ) * ipv );
            }

            Src += InElIncr;
            Dst += OutElIncr;
        }
    }

    /**
     * Function peforms scanline filtering with optional downsampling.
     * Function makes use of the symmetry of the filter.
     *
     * @param Src Source scanline buffer (length = this -> InLen). Source
     * scanline increment will be equal to 1.
     * @param Dst Destination scanline buffer.
     * @param DstIncr Destination scanline buffer increment, used for
     * horizontal or vertical scanline stepping.
     */

    void doFilter( const fptype* const Src, fptype* Dst,
        const int DstIncr ) const
    {
        const int ElCount = Vars -> ElCount;
        const int elalign = Vars -> elalign;
        const fptype* const f = &Flt[ 0 ];
        const int flen = Flt.getCapacity();
        const int ipstep = ResampleFactor;
        int i;
        int j;

        if( ElCount == 1 )
        {
            const fptype* ip = Src - EdgePixelCount * ipstep - FltLatency;
            fptype* op = Dst;
            int l = OutLen;

            while( l > 0 )
            {
                fptypesimd s = fptypesimd :: load( f ) *
                    fptypesimd :: loadu( ip );

                for( i = elalign; i < flen; i += elalign )
                {
                    s += fptypesimd :: load( f + i ) *
                        fptypesimd :: loadu( ip + i );
                }

                op[ 0 ] = s.hadd();
                op += DstIncr;
                ip += ipstep;
                l--;
            }
        }
        else
        if( DstIncr == 1 )
        {
            for( j = 0; j < ElCount; j++ )
            {
                const fptype* ip = Src - EdgePixelCount * ipstep -
                    FltLatency + j * InElIncr;

                fptype* op = Dst + j * OutElIncr;
                int l = OutLen;

                while( l > 0 )
                {
                    fptypesimd s = fptypesimd :: load( f ) *
                        fptypesimd :: loadu( ip );

                    for( i = elalign; i < flen; i += elalign )
                    {
                        s += fptypesimd :: load( f + i ) *
                            fptypesimd :: loadu( ip + i );
                    }

                    op[ 0 ] = s.hadd();
                    op += DstIncr;
                    ip += ipstep;
                    l--;
                }
            }
        }
        else
        {
            const fptype* ip0 = Src - EdgePixelCount * ipstep - FltLatency;
            fptype* op0 = Dst;
            int l = OutLen;

            while( l > 0 )
            {
                const fptype* ip = ip0;
                fptype* op = op0;

                for( j = 0; j < ElCount; j++ )
                {
                    fptypesimd s = fptypesimd :: load( f ) *
                        fptypesimd :: loadu( ip );

                    for( i = elalign; i < flen; i += elalign )
                    {
                        s += fptypesimd :: load( f + i ) *
                            fptypesimd :: loadu( ip + i );
                    }

                    op[ 0 ] = s.hadd();
                    ip += InElIncr;
                    op += OutElIncr;
                }

                ip0 += ipstep;
                op0 += DstIncr;
                l--;
            }
        }
    }

    /**
     * Function performs resizing of a single scanline. This function does
     * not "know" about the length of the source scanline buffer. This buffer
     * should be padded with enough pixels so that ( SrcPos - FilterLenD2 ) is
     * always >= 0 and ( SrcPos + ( DstLineLen - 1 ) * k + FilterLenD2 + 1 )
     * does not exceed source scanline's buffer length. SrcLine's increment is
     * assumed to be equal to 1.
     *
     * @param SrcLine Source scanline buffer.
     * @param DstLine Destination (resized) scanline buffer.
     * @param DstLineIncr Destination scanline position increment, used for
     * horizontal or vertical scanline stepping.
     * @param xx Temporary buffer, of size FltBank -> getFilterLen(), must be
     * aligned by fpclass :: fpalign.
     */

    void doResize( const fptype* SrcLine, fptype* DstLine,
        int DstLineIncr, fptype* const xx ) const
    {
        const int IntFltLen = FltBank -> getFilterLen();
        const int ElCount = Vars -> ElCount;
        const int elalign = Vars -> elalign;
        const typename CImageResizerFilterStep< fptype, fptype > ::
            CResizePos* rpos = &(*RPosBuf)[ 0 ];

        int DstLineLen = OutLen;
        int i;
        int j;

#define AVIR_RESIZE_PART1 \
            while( DstLineLen > 0 ) \
            { \
                const fptypesimd x = (fptypesimd) rpos -> x; \
                const fptype* ftp = rpos -> ftp; \
                const fptype* ftp2 = rpos -> ftp + IntFltLen; \
                const fptype* Src = SrcLine + rpos -> SrcOffs;

#define AVIR_RESIZE_PART1nx \
            while( DstLineLen > 0 ) \
            { \
                const fptype* ftp = rpos -> ftp; \
                const fptype* Src = SrcLine + rpos -> SrcOffs;

#define AVIR_RESIZE_PART2 \
                DstLine += DstLineIncr; \
                rpos++; \
                DstLineLen--; \
            }

        if( ElCount == 1 )
        {
            if( FltBank -> getOrder() == 1 )
            {
                AVIR_RESIZE_PART1

                fptypesimd sum = ( fptypesimd :: load( ftp ) +
                    fptypesimd :: load( ftp2 ) * x ) *
                    fptypesimd :: loadu( Src );

                for( i = elalign; i < IntFltLen; i += elalign )
                {
                    sum += ( fptypesimd :: load( ftp + i ) +
                        fptypesimd :: load( ftp2 + i ) * x ) *
                        fptypesimd :: loadu( Src + i );
                }

                DstLine[ 0 ] = sum.hadd();

                AVIR_RESIZE_PART2
            }
            else
            {
                AVIR_RESIZE_PART1nx

                fptypesimd sum = fptypesimd :: load( ftp ) *
                    fptypesimd :: loadu( Src );

                for( i = elalign; i < IntFltLen; i += elalign )
                {
                    sum += fptypesimd :: load( ftp + i ) *
                        fptypesimd :: loadu( Src + i );
                }

                DstLine[ 0 ] = sum.hadd();

                AVIR_RESIZE_PART2
            }
        }
        else
        if( DstLineIncr == 1 )
        {
            // Horizontal-oriented processing, element loop is outer.

            const int SrcIncr = InElIncr;
            const int DstLineElIncr = OutElIncr - DstLineIncr * DstLineLen;

            if( FltBank -> getOrder() == 1 )
            {
                for( j = 0; j < ElCount; j++ )
                {
                    AVIR_RESIZE_PART1

                    fptypesimd sum = 0.0;

                    for( i = 0; i < IntFltLen; i += elalign )
                    {
                        sum += ( fptypesimd :: load( ftp + i ) +
                            fptypesimd :: load( ftp2 + i ) * x ) *
                            fptypesimd :: loadu( Src + i );
                    }

                    DstLine[ 0 ] = sum.hadd();

                    AVIR_RESIZE_PART2

                    DstLine += DstLineElIncr;
                    SrcLine += SrcIncr;
                    DstLineLen = OutLen;
                    rpos = &(*RPosBuf)[ 0 ];
                }
            }
            else
            {
                for( j = 0; j < ElCount; j++ )
                {
                    AVIR_RESIZE_PART1nx

                    fptypesimd sum = fptypesimd :: load( ftp ) *
                        fptypesimd :: loadu( Src );

                    for( i = elalign; i < IntFltLen; i += elalign )
                    {
                        sum += fptypesimd :: load( ftp + i ) *
                            fptypesimd :: loadu( Src + i );
                    }

                    DstLine[ 0 ] = sum.hadd();

                    AVIR_RESIZE_PART2

                    DstLine += DstLineElIncr;
                    SrcLine += SrcIncr;
                    DstLineLen = OutLen;
                    rpos = &(*RPosBuf)[ 0 ];
                }
            }
        }
        else
        {
            const int SrcIncr = InElIncr;
            const int DstLineElIncr = OutElIncr;
            DstLineIncr -= DstLineElIncr * ElCount;

            if( FltBank -> getOrder() == 1 )
            {
                AVIR_RESIZE_PART1

                for( i = 0; i < IntFltLen; i += elalign )
                {
                    ( fptypesimd :: load( ftp + i ) +
                        fptypesimd :: load( ftp2 + i ) * x ).store( xx + i );
                }

                for( j = 0; j < ElCount; j++ )
                {
                    fptypesimd sum = fptypesimd :: load( xx ) *
                        fptypesimd :: loadu( Src );

                    for( i = elalign; i < IntFltLen; i += elalign )
                    {
                        sum += fptypesimd :: load( xx + i ) *
                            fptypesimd :: loadu( Src + i );
                    }

                    DstLine[ 0 ] = sum.hadd();
                    DstLine += DstLineElIncr;
                    Src += SrcIncr;
                }

                AVIR_RESIZE_PART2
            }
            else
            {
                AVIR_RESIZE_PART1nx

                for( j = 0; j < ElCount; j++ )
                {
                    fptypesimd sum = fptypesimd :: load( ftp ) *
                        fptypesimd :: loadu( Src );

                    for( i = elalign; i < IntFltLen; i += elalign )
                    {
                        sum += fptypesimd :: load( ftp + i ) *
                            fptypesimd :: loadu( Src + i );
                    }

                    DstLine[ 0 ] = sum.hadd();
                    DstLine += DstLineElIncr;
                    Src += SrcIncr;
                }

                AVIR_RESIZE_PART2
            }
        }

#undef AVIR_RESIZE_PART2
#undef AVIR_RESIZE_PART1nx
#undef AVIR_RESIZE_PART1
    }
};

/**
 * @brief Image resizer's default de-interleaved dithering class.
 *
 * This class defines an object that performs rounding, clipping and dithering
 * operations over horizontal scanline pixels before scanline is stored in the
 * output buffer.
 *
 * This ditherer implementation uses de-interleaved SIMD algorithm.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types cannot be used.
 * @tparam fptypesimd The SIMD type used to store a pack of "fptype" values.
 */

template< class fptype, class fptypesimd >
class CImageResizerDithererDefDIL
{
public:
    /**
     * Function initializes the ditherer object.
     *
     * @param aLen Scanline length in pixels to process.
     * @param aVars Image resizing-related variables.
     * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
     * truncation.
     * @param aPkOut Peak output value allowed.
     */

    void init( const int aLen, const CImageResizerVars& aVars,
        const double aTrMul, const double aPkOut )
    {
        Len = aLen;
        Vars = &aVars;
        LenE = aLen * Vars -> ElCount;
        TrMul0 = aTrMul;
        PkOut0 = aPkOut;
    }

    /**
     * @return "True" if dithering is recursive relative to scanlines meaning
     * multi-threaded execution is not supported by this dithering method.
     */

    static bool isRecursive()
    {
        return( false );
    }

    /**
     * Function performs rounding and clipping operations.
     *
     * @param ResScanline The buffer containing the final scanline.
     */

    void dither( fptype* const ResScanline ) const
    {
        const int elalign = Vars -> elalign;
        const fptypesimd c0 = 0.0;
        const fptypesimd PkOut = (fptypesimd) PkOut0;
        int j;

        if( TrMul0 == 1.0 )
        {
            // Optimization - do not perform bit truncation.

            for( j = 0; j < LenE - elalign; j += elalign )
            {
                const fptypesimd z0 = round(
                    fptypesimd :: loadu( ResScanline + j ));

                clamp( z0, c0, PkOut ).storeu( ResScanline + j );
            }

            const int lim = LenE - j;
            const fptypesimd z0 = round(
                fptypesimd :: loadu( ResScanline + j, lim ));

            clamp( z0, c0, PkOut ).storeu( ResScanline + j, lim );
        }
        else
        {
            const fptypesimd TrMul = (fptypesimd) TrMul0;

            for( j = 0; j < LenE - elalign; j += elalign )
            {
                const fptypesimd z0 = round(
                    fptypesimd :: loadu( ResScanline + j ) / TrMul ) * TrMul;

                clamp( z0, c0, PkOut ).storeu( ResScanline + j );
            }

            const int lim = LenE - j;
            const fptypesimd z0 = round(
                fptypesimd :: loadu( ResScanline + j, lim ) / TrMul ) * TrMul;

            clamp( z0, c0, PkOut ).storeu( ResScanline + j, lim );
        }
    }

protected:
    int Len; ///< Scanline's length in pixels.
        ///<
    const CImageResizerVars* Vars; ///< Image resizing-related variables.
        ///<
    int LenE; ///< = LenE * ElCount.
        ///<
    double TrMul0; ///< Bit-depth truncation multiplier.
        ///<
    double PkOut0; ///< Peak output value allowed.
        ///<
};

/**
 * @brief Image resizer's error-diffusion dithering class, de-interleaved
 * mode.
 *
 * This ditherer implements error-diffusion dithering which looks good, and
 * whose results are compressed by PNG well.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types cannot be used.
 * @tparam fptypesimd Processing type, SIMD can be used.
 */

template< class fptype, class fptypesimd >
class CImageResizerDithererErrdDIL
{
public:
    /**
     * Function initializes the ditherer object.
     *
     * @param aLen Scanline length in pixels to process.
     * @param aVars Image resizing-related variables.
     * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
     * truncation.
     * @param aPkOut Peak output value allowed.
     */

    void init( const int aLen, const CImageResizerVars& aVars,
        const double aTrMul, const double aPkOut )
    {
        Len = aLen;
        Vars = &aVars;
        LenE = aLen * Vars -> ElCount;
        TrMul0 = aTrMul;
        PkOut0 = aPkOut;

        ResScanlineDith0.alloc( LenE + Vars -> ElCount, sizeof( fptype ));
        ResScanlineDith = ResScanlineDith0 + Vars -> ElCount;
        int i;

        for( i = 0; i < LenE + Vars -> ElCount; i++ )
        {
            ResScanlineDith0[ i ] = 0.0;
        }
    }

    static bool isRecursive()
    {
        return( true );
    }

    void dither( fptype* const ResScanline )
    {
        const int ea = Vars -> elalign;
        const fptypesimd c0 = 0.0;
        const fptypesimd TrMul = (fptypesimd) TrMul0;
        const fptypesimd PkOut = (fptypesimd) PkOut0;
        int j;

        for( j = 0; j < LenE - ea; j += ea )
        {
            fptypesimd :: addu( ResScanline + j,
                fptypesimd :: loadu( ResScanlineDith + j ));

            c0.storeu( ResScanlineDith + j );
        }

        int lim = LenE - j;
        fptypesimd :: addu( ResScanline + j,
            fptypesimd :: loadu( ResScanlineDith + j, lim ), lim );

        c0.storeu( ResScanlineDith + j, lim );

        const int Len1 = Len - 1;
        fptype* rs = ResScanline;
        fptype* rsd = ResScanlineDith;
        int i;

        for( i = 0; i < Vars -> ElCount; i++ )
        {
            for( j = 0; j < Len1; j++ )
            {
                // Perform rounding, noise estimation and saturation.

                fptype* const rsj = rs + j;
                const fptype z0 = round( rsj[ 0 ] / TrMul ) * TrMul;
                const fptype Noise = rsj[ 0 ] - z0;
                rsj[ 0 ] = clamp( z0, (fptype) 0.0, PkOut );

                fptype* const rsdj = rsd + j;
                rsj[ 1 ] += Noise * (fptype) 0.364842;
                rsdj[ -1 ] += Noise * (fptype) 0.207305;
                rsdj[ 0 ] += Noise * (fptype) 0.364842;
                rsdj[ 1 ] += Noise * (fptype) 0.063011;
            }

            // Process the last pixel element in scanline.

            const fptype z1 = round( rs[ Len1 ] / TrMul ) * TrMul;
            const fptype Noise2 = rs[ Len1 ] - z1;
            rs[ Len1 ] = clamp( z1, c0, PkOut );

            rsd[ Len1 - 1 ] += Noise2 * (fptype) 0.207305;
            rsd[ Len1 ] += Noise2 * (fptype) 0.364842;

            rs += Len;
            rsd += Len;
        }
    }

protected:
    int Len; ///< Scanline's length in pixels.
        ///<
    const CImageResizerVars* Vars; ///< Image resizing-related variables.
        ///<
    int LenE; ///< = LenE * ElCount.
        ///<
    double TrMul0; ///< Bit-depth truncation multiplier.
        ///<
    double PkOut0; ///< Peak output value allowed.
        ///<
    CBuffer< fptype > ResScanlineDith0; ///< Error propagation buffer for
        ///< dithering, first pixel unused.
        ///<
    fptype* ResScanlineDith; ///< Error propagation buffer pointer which skips
        ///< the first ElCount elements.
        ///<
};

/**
 * @brief Floating-point processing definition and abstraction class for
 * de-interleaved processing.
 *
 * This class defines several constants and typedefs that point to classes
 * that should be used by the image resizing algorithm. This implementation
 * points to de-interleaved processing classes.
 *
 * @tparam afptype Floating point type to use for storing intermediate data
 * and variables. SIMD types should not be used.
 * @tparam afptypesimd SIMD type used to perform processing.
 * @tparam adith Ditherer class to use during processing.
 */

template< class afptype, class afptypesimd,
    class adith = CImageResizerDithererDefDIL< afptype, afptypesimd > >
class fpclass_def_dil
{
public:
    typedef afptype fptype; ///< Floating-point type to use during processing.
        ///<
    typedef afptype fptypeatom; ///< Atomic type "fptype" consists of.
        ///<
    static const int fppack = 1; ///< The number of atomic types stored in a
        ///< single "fptype" element.
        ///<
    static const int fpalign = sizeof( afptypesimd ); ///< Suggested alignment
        ///< size in bytes. This is not a required alignment, because image
        ///< resizing algorithm cannot be made to have a strictly aligned data
        ///< access in all cases (e.g. de-interleaved interpolation cannot
        ///< perform aligned accesses).
        ///<
    static const int elalign = sizeof( afptypesimd ) / sizeof( afptype ); ///<
        ///< Length alignment of arrays of elements. This applies to filters
        ///< and intermediate buffers: this constant forces filters and
        ///< scanlines to have a length which is a multiple of this value, for
        ///< more efficient SIMD implementation.
        ///<
    static const int packmode = 1; ///< 0 if interleaved packing, 1 if
        ///< de-interleaved.
        ///<
    typedef CImageResizerFilterStepDIL< fptype, afptypesimd > CFilterStep; ///<
        ///< Filtering step class to use during processing.
        ///<
    typedef adith CDitherer; ///< Ditherer class to use during processing.
        ///<
};

} // namespace avir