//$ nobt
//$ nocpp

/**
 * @file avir_float4_sse.h
 *
 * @brief Inclusion file for the "float4" type.
 *
 * This file includes the "float4" SSE-based type used for SIMD variable
 * storage and processing.
 *
 * AVIR Copyright (c) 2015-2019 Aleksey Vaneev
 */

#ifndef AVIR_FLOAT4_SSE_INCLUDED
#define AVIR_FLOAT4_SSE_INCLUDED

#include <xmmintrin.h>
#include <emmintrin.h>

namespace avir {

/**
 * @brief SIMD packed 4-float type.
 *
 * This class wraps a single SSE "__m128" value holding 4 packed "float"
 * elements, and provides element-wise arithmetic together with memory load
 * and store functions, for parallel computation on SSE-enabled processors.
 * This class can be used as the "fptype" argument of the avir::fpclass_def
 * class.
 */

class float4
{
public:
    /**
     * Default constructor, leaves the packed value uninitialized.
     */

    float4()
    {
    }

    /**
     * Copy constructor.
     *
     * @param s Source object to copy the packed value from.
     */

    float4( const float4& s )
        : value( s.value )
    {
    }

    /**
     * Constructor, takes over a raw SSE packed value.
     *
     * @param s Packed value to store.
     */

    float4( const __m128 s )
        : value( s )
    {
    }

    /**
     * Constructor, replicates a scalar into all 4 elements.
     *
     * @param s Scalar value to replicate.
     */

    float4( const float s )
        : value( _mm_set1_ps( s ))
    {
    }

    /**
     * Copy assignment.
     *
     * @param s Source object to copy the packed value from.
     * @return Reference to *this.
     */

    float4& operator = ( const float4& s )
    {
        value = s.value;
        return *this;
    }

    /**
     * Assignment of a raw SSE packed value.
     *
     * @param s Packed value to store.
     * @return Reference to *this.
     */

    float4& operator = ( const __m128 s )
    {
        value = s;
        return *this;
    }

    /**
     * Assignment of a scalar, replicated into all 4 elements.
     *
     * @param s Scalar value to replicate.
     * @return Reference to *this.
     */

    float4& operator = ( const float s )
    {
        value = _mm_set1_ps( s );
        return *this;
    }

    /**
     * @return The lowest (first) element of the packed value.
     */

    operator float () const
    {
        return _mm_cvtss_f32( value );
    }

    /**
     * Function loads 4 elements from an aligned memory location.
     *
     * @param p Pointer to memory to load from, should be 16-byte aligned.
     * @return float4 value holding the 4 loaded elements.
     */

    static float4 load( const float* const p )
    {
        return float4( _mm_load_ps( p ));
    }

    /**
     * Function loads 4 elements from a memory location of any alignment.
     *
     * @param p Pointer to memory to load from, may have any alignment.
     * @return float4 value holding the 4 loaded elements.
     */

    static float4 loadu( const float* const p )
    {
        return float4( _mm_loadu_ps( p ));
    }

    /**
     * Function loads up to "lim" lower elements from a memory location of
     * any alignment, the remaining elements are set to 0.
     *
     * @param p Pointer to memory to load from, may have any alignment.
     * @param lim The maximum number of elements to load, >0. Values above 3
     * load all 4 elements, values below 2 load a single element.
     * @return float4 value loaded from the specified memory location, with
     * elements beyond "lim" set to 0.
     */

    static float4 loadu( const float* const p, int lim )
    {
        if( lim > 3 )
        {
            return float4( _mm_loadu_ps( p ));
        }

        if( lim == 3 )
        {
            return float4( _mm_set_ps( 0.0f, p[ 2 ], p[ 1 ], p[ 0 ]));
        }

        if( lim == 2 )
        {
            return float4( _mm_set_ps( 0.0f, 0.0f, p[ 1 ], p[ 0 ]));
        }

        // Single-element load, upper 3 elements are zeroed.

        return float4( _mm_load_ss( p ));
    }

    /**
     * Function writes all 4 elements of *this value to an aligned memory
     * location.
     *
     * @param[out] p Output memory location, should be 16-byte aligned.
     */

    void store( float* const p ) const
    {
        _mm_store_ps( p, value );
    }

    /**
     * Function writes all 4 elements of *this value to a memory location of
     * any alignment.
     *
     * @param[out] p Output memory location, may have any alignment.
     */

    void storeu( float* const p ) const
    {
        _mm_storeu_ps( p, value );
    }

    /**
     * Function writes "lim" lower elements of *this value to a memory
     * location of any alignment. Memory beyond the written elements is left
     * untouched.
     *
     * @param[out] p Output memory location, may have any alignment.
     * @param lim The number of lower elements to store, >0. Values above 3
     * store all 4 elements, values below 2 store a single element.
     */

    void storeu( float* const p, int lim ) const
    {
        if( lim > 3 )
        {
            _mm_storeu_ps( p, value );
            return;
        }

        if( lim < 2 )
        {
            _mm_store_ss( p, value );
            return;
        }

        // Both the 2- and 3-element cases begin with the lower pair.

        _mm_storel_pi( static_cast< __m64* >( static_cast< void* >( p )),
            value );

        if( lim == 3 )
        {
            // Bring element 2 down to the lowest position and store it.

            _mm_store_ss( p + 2, _mm_movehl_ps( value, value ));
        }
    }

    /**
     * Element-wise in-place addition.
     *
     * @param s Value to add.
     * @return Reference to *this.
     */

    float4& operator += ( const float4& s )
    {
        value = _mm_add_ps( value, s.value );
        return *this;
    }

    /**
     * Element-wise in-place subtraction.
     *
     * @param s Value to subtract.
     * @return Reference to *this.
     */

    float4& operator -= ( const float4& s )
    {
        value = _mm_sub_ps( value, s.value );
        return *this;
    }

    /**
     * Element-wise in-place multiplication.
     *
     * @param s Value to multiply by.
     * @return Reference to *this.
     */

    float4& operator *= ( const float4& s )
    {
        value = _mm_mul_ps( value, s.value );
        return *this;
    }

    /**
     * Element-wise in-place division.
     *
     * @param s Value to divide by.
     * @return Reference to *this.
     */

    float4& operator /= ( const float4& s )
    {
        value = _mm_div_ps( value, s.value );
        return *this;
    }

    /**
     * @param s Value to add.
     * @return Element-wise sum of *this and "s".
     */

    float4 operator + ( const float4& s ) const
    {
        return float4( _mm_add_ps( value, s.value ));
    }

    /**
     * @param s Value to subtract.
     * @return Element-wise difference of *this and "s".
     */

    float4 operator - ( const float4& s ) const
    {
        return float4( _mm_sub_ps( value, s.value ));
    }

    /**
     * @param s Value to multiply by.
     * @return Element-wise product of *this and "s".
     */

    float4 operator * ( const float4& s ) const
    {
        return float4( _mm_mul_ps( value, s.value ));
    }

    /**
     * @param s Value to divide by.
     * @return Element-wise quotient of *this and "s".
     */

    float4 operator / ( const float4& s ) const
    {
        return float4( _mm_div_ps( value, s.value ));
    }

    /**
     * @return Horizontal sum of elements, evaluated as
     * (e0 + e2) + (e1 + e3).
     */

    float hadd() const
    {
        // Fold the upper pair onto the lower pair.

        const __m128 upper = _mm_movehl_ps( value, value );
        const __m128 pairs = _mm_add_ps( value, upper );

        // Move element 1 of the pair sums to the lowest position.

        const __m128 second = _mm_shuffle_ps( pairs, pairs,
            _MM_SHUFFLE( 0, 0, 0, 1 ));

        return _mm_cvtss_f32( _mm_add_ss( pairs, second ));
    }

    /**
     * Function performs in-place addition of a value located in memory and
     * the specified value, over all 4 elements.
     *
     * @param p Pointer to value where addition happens. May be unaligned.
     * @param v Value to add.
     */

    static void addu( float* const p, const float4& v )
    {
        float4 sum = loadu( p );
        sum += v;
        sum.storeu( p );
    }

    /**
     * Function performs in-place addition of a value located in memory and
     * the specified value. Limited to the specified number of elements.
     *
     * @param p Pointer to value where addition happens. May be unaligned.
     * @param v Value to add.
     * @param lim The element number limit, >0.
     */

    static void addu( float* const p, const float4& v, const int lim )
    {
        float4 sum = loadu( p, lim );
        sum += v;
        sum.storeu( p, lim );
    }

    __m128 value; ///< Packed value of 4 floats.
        ///<
};

/**
 * SIMD rounding function, rounds each packed element to an integer value.
 *
 * The rounding is performed by converting the elements to 32-bit integers
 * and back, with the SSE rounding mode temporarily set to "nearest"; the
 * previously active rounding mode is restored before the function returns.
 *
 * Elements whose magnitude is 2^23 or above are returned unchanged: such
 * "float" values have no fractional part, and, if beyond the 32-bit integer
 * range, would otherwise come out of the integer conversion as -2147483648.
 * NaN elements are returned unchanged for the same reason.
 *
 * @param v Value to round.
 * @return Rounded SIMD value.
 */

inline float4 round( const float4& v )
{
    const unsigned int prevrm = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );

    const __m128 res = _mm_cvtepi32_ps( _mm_cvtps_epi32( v.value ));

    _MM_SET_ROUNDING_MODE( prevrm );

    // Build a per-element mask of inputs that must bypass the integer
    // conversion. "Not less than" is used instead of "greater or equal" so
    // that the mask is also set for NaN elements.

    const __m128 absmask = _mm_castsi128_ps( _mm_set1_epi32( 0x7FFFFFFF ));
    const __m128 absv = _mm_and_ps( v.value, absmask );
    const __m128 keep = _mm_cmpnlt_ps( absv, _mm_set1_ps( 8388608.0f ));

    // Select the original element where the mask is set, the rounded
    // element otherwise.

    return( _mm_or_ps( _mm_and_ps( keep, v.value ),
        _mm_andnot_ps( keep, res )));
}

/**
 * SIMD function "clamps" (clips) the specified packed values, element by
 * element, so that they are not less than "minv", and not greater than
 * "maxv". The lower bound is applied first, then the upper bound.
 *
 * @param Value Value to clamp.
 * @param minv Minimal allowed value.
 * @param maxv Maximal allowed value.
 * @return The clamped value.
 */

inline float4 clamp( const float4& Value, const float4& minv,
    const float4& maxv )
{
    const __m128 raised = _mm_max_ps( Value.value, minv.value );
    const __m128 bounded = _mm_min_ps( raised, maxv.value );

    return bounded;
}

typedef fpclass_def< avir :: float4, float > fpclass_float4; ///<
    ///< Class that can be used as the "fpclass" template parameter of the
    ///< avir::CImageResizer class to perform calculation using the default
    ///< interleaved algorithm, using the SIMD float4 type. This is the
    ///< "fpclass_def" template instantiated with "float4" and "float"; the
    ///< "fpclass_def" template is not declared in this file and has to be
    ///< declared before this point.
    ///<

} // namespace avir

#endif // AVIR_FLOAT4_SSE_INCLUDED