Table of Contents

Target Platform(s):

include/AK/SoundEngine/Platforms/arm_neon/AkSimd.h File Reference

Go to the source code of this file.

Defines

Platform specific memory size alignment for allocation purposes

#define  AKSIMD_ALIGNSIZE(__Size__)   (((__Size__) + 15) & ~15)
AKSIMD loading / setting

#define  AKSIMD_LOAD_V4F32(__addr__)   vld1q_f32( (float32_t*)(__addr__) )
  Loads four single-precision, floating-point values (see _mm_load_ps).
#define  AKSIMD_LOADU_V4F32(__addr__)   vld1q_f32( (float32_t*)(__addr__) )
#define  AKSIMD_LOAD1_V4F32(__scalar__)   vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
#define  AKSIMD_SET_V4F32(__scalar__)   vdupq_n_f32( __scalar__ )
#define  AKSIMD_SET_V4I32(__scalar__)   vdupq_n_s32( __scalar__ )
  Sets the four integer values to __scalar__.
#define  AKSIMD_SETZERO_V4F32()   AKSIMD_SET_V4F32( 0 )
#define  AKSIMD_LOAD_SS_V4F32(__addr__)   vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 );
#define  AKSIMD_LOAD_V4I32(__addr__)   vld1q_s32( (const int32_t*)(__addr__) )
  Loads four 32-bit signed integer values (aligned).
#define  AKSIMD_LOAD_V8I16(__addr__)   vld1q_s16( (const int16_t*)(__addr__) )
  Loads 8 16-bit signed integer values (aligned).
#define  AKSIMD_LOAD_V4I16(__addr__)   vld1_s16( (const int16_t*)(__addr__) )
  Loads 4 16-bit signed integer values (aligned).
#define  AKSIMD_LOADU_V4I32(__addr__)   vld1q_s32( (const int32_t*)(__addr__))
  Loads unaligned 128-bit value (see _mm_loadu_si128).
#define  AKSIMD_SETZERO_V4I32()   vdupq_n_s32( 0 )
  Sets the four 32-bit integer values to zero (see _mm_setzero_si128).
#define  AKSIMD_LOAD_V2F32(__addr__)   vld1_f32( (float32_t*)(__addr__) )
  Loads two single-precision, floating-point values.
#define  AKSIMD_LOAD_V2F32_LANE(__addr__, __vec__, __lane__)   vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
#define  AKSIMD_SET_V2F32(__scalar__)   vdup_n_f32( __scalar__ )
  Sets the two single-precision, floating-point values to __scalar__.
#define  AKSIMD_SETZERO_V2F32()   AKSIMD_SET_V2F32( 0 )
  Sets the two single-precision, floating-point values to zero.
#define  AKSIMD_LOAD_V4F32X2(__addr__)   vld2q_f32( (float32_t*)(__addr__) )
  Loads data from memory and de-interleaves.
#define  AKSIMD_LOAD_V2F32X2(__addr__)   vld2_f32( (float32_t*)(__addr__) )
#define  AKSIMD_LOAD_V2F32X2_LANE(__addr__, __vec__, __lane__)   vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
  Loads data from memory and de-interleaves; only selected lane.
#define  AKSIMD_LOAD_V4F32X4_LANE(__addr__, __vec__, __lane__)   vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
AKSIMD storing

#define  AKSIMD_STORE_V4F32(__addr__, __vName__)   vst1q_f32( (float32_t*)(__addr__), (__vName__) )
  Stores four single-precision, floating-point values. The address must be 16-byte aligned.
#define  AKSIMD_STOREU_V4F32(__addr__, __vec__)   vst1q_f32( (float32_t*)(__addr__), (__vec__) )
  Stores four single-precision, floating-point values. The address does not need to be 16-byte aligned.
#define  AKSIMD_STORE1_V4F32(__addr__, __vec__)   vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
#define  AKSIMD_STORE_V4I32(__addr__, __vec__)   vst1q_s32( (int32_t*)(__addr__), (__vec__) )
  Stores four 32-bit integer values. The address must be 16-byte aligned.
#define  AKSIMD_STOREU_V4I32(__addr__, __vec__)   vst1q_s32( (int32_t*)(__addr__), (__vec__) )
  Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
#define  AKSIMD_STOREU_V4UI32(__addr__, __vec__)   vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
  Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
#define  AKSIMD_STORE_V2F32(__addr__, __vName__)   vst1_f32( (AkReal32*)(__addr__), (__vName__) )
  Stores two single-precision, floating-point values. The address must be 16-byte aligned.
#define  AKSIMD_STORE_V4F32X2(__addr__, __vName__)   vst2q_f32( (float32_t*)(__addr__), (__vName__) )
  Stores data by interleaving into memory.
#define  AKSIMD_STORE_V2F32X2(__addr__, __vName__)   vst2_f32( (float32_t*)(__addr__), (__vName__) )
AKSIMD conversion

#define  AKSIMD_CONVERT_V4I32_TO_V4F32(__vec__)   vcvtq_f32_s32( __vec__ )
#define  AKSIMD_CONVERT_V4F32_TO_V4I32(__vec__)   vcvtq_s32_f32( __vec__ )
#define  AKSIMD_TRUNCATE_V4F32_TO_V4I32(__vec__)   vcvtq_s32_f32( (__vec__) )
#define  AKSIMD_CONVERT_V2F32_TO_V2I32(__vec__)   vcvt_s32_f32( __vec__ )
AKSIMD shifting

#define  AKSIMD_SHIFTLEFT_V4I32(__vec__, __shiftBy__)   vshlq_n_s32( (__vec__), (__shiftBy__) )
#define  AKSIMD_SHIFTRIGHTARITH_V4I32(__vec__, __shiftBy__)   vrshrq_n_s32( (__vec__), (__shiftBy__) )

Typedefs

AKSIMD types

typedef int32x4_t  AKSIMD_V4I32
  Vector of 4 32-bit signed integers.
typedef int16x8_t  AKSIMD_V8I16
  Vector of 8 16-bit signed integers.
typedef int16x4_t  AKSIMD_V4I16
  Vector of 4 16-bit signed integers.
typedef uint32x4_t  AKSIMD_V4UI32
  Vector of 4 32-bit unsigned integers.
typedef uint32x2_t  AKSIMD_V2UI32
  Vector of 2 32-bit unsigned integers.
typedef int32x2_t  AKSIMD_V2I32
  Vector of 2 32-bit signed integers.
typedef float32_t  AKSIMD_F32
  32-bit float.
typedef float32x2_t  AKSIMD_V2F32
  Vector of 2 32-bit floats.
typedef float32x4_t  AKSIMD_V4F32
  Vector of 4 32-bit floats.
typedef uint32x4_t  AKSIMD_V4COND
  Vector of 4 comparison results.
typedef uint32x4_t  AKSIMD_V4ICOND
  Vector of 4 comparison results.
typedef uint32x4_t  AKSIMD_V4FCOND
  Vector of 4 comparison results.
typedef float32x2x2_t  AKSIMD_V2F32X2
typedef float32x4x2_t  AKSIMD_V4F32X2
typedef float32x4x4_t  AKSIMD_V4F32X4

AKSIMD logical operations



#define  AKSIMD_AND_V4I32(__a__, __b__)   vandq_s32( (__a__), (__b__) )
#define  AKSIMD_CMPGT_V8I16(__a__, __b__)   vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
#define  AKSIMD_CMPLE_V4F32(__a__, __b__)   vcleq_f32( (__a__), (__b__) )
  Compares for less than or equal (see _mm_cmple_ps).
#define  AKSIMD_CMPLT_V4I32(__a__, __b__)   vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
#define  AKSIMD_CMPGT_V4I32(__a__, __b__)   vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
#define  AKSIMD_XOR_V4I32(__a__, __b__)   veorq_s32(__a__, __b__)
#define  AKSIMD_SUB_V4I32(__a__, __b__)   vsubq_s32(__a__, __b__)
static AkForceInline AKSIMD_V4F32  AKSIMD_XOR_V4F32 (const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)

AKSIMD shuffling



#define  AKSIMD_COMBINE_V2F32(a, b)   vcombine_f32( a, b )
#define  AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define  AKSIMD_SHUFFLE_V4F32(a, b, zyxw)   _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
#define  AKSIMD_SHUFFLE_BCDA(__a__)   AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
  Barrel-shift all floats by one.
#define  AKSIMD_SHUFFLE_BADC(__a__)   vrev64q_f32( __a__ )
  Swap the 2 lower floats together and the 2 higher floats together.
#define  AKSIMD_SHUFFLE_CDAB(__a__)   vcombine_f32( vget_high_f32(__a__), vget_low_f32(__a__) )
  Swap the 2 lower floats with the 2 higher floats.
#define  AKSIMD_DUP_ODD(__vv)   AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
  Duplicates the odd items into the even items (d c b a -> d d b b ).
#define  AKSIMD_DUP_EVEN(__vv)   AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
  Duplicates the even items into the odd items (d c b a -> c c a a ).
AKSIMD_V4F32  AKSIMD_MOVEHL_V4F32 (const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw)
AKSIMD_V4F32  AKSIMD_MOVELH_V4F32 (const AKSIMD_V4F32 &xyzw, const AKSIMD_V4F32 &abcd)

AKSIMD arithmetic



#define  AKSIMD_SUB_V4F32(__a__, __b__)   vsubq_f32( (__a__), (__b__) )
#define  AKSIMD_SUB_V2F32(__a__, __b__)   vsub_f32( (__a__), (__b__) )
#define  AKSIMD_SUB_SS_V4F32(__a__, __b__)   vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) );
#define  AKSIMD_ADD_V4F32(__a__, __b__)   vaddq_f32( (__a__), (__b__) )
#define  AKSIMD_ADD_V2F32(__a__, __b__)   vadd_f32( (__a__), (__b__) )
#define  AKSIMD_ADD_V4I32(__a__, __b__)   vaddq_s32( (__a__), (__b__) )
  Adds the four integers of a and b.
#define  AKSIMD_MULLO16_V4I32(__a__, __b__)   vmulq_s32(__a__, __b__)
  Multiplies the 4 low parts of both operands into 4 32-bit integers (no overflow).
#define  AKSIMD_COMP_V4F32(__a__, __b__)   vceqq_f32( (__a__), (__b__) )
#define  AKSIMD_COMP_V2F32(__a__, __b__)   vceq_f32( (__a__), (__b__) )
#define  AKSIMD_ADD_SS_V4F32(__a__, __b__)   vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define  AKSIMD_MUL_V4F32(__a__, __b__)   vmulq_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_V4F32_SCALAR(__a__, __b__)   vmulq_n_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_V2F32(__a__, __b__)   vmul_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_V2F32_SCALAR(__a__, __b__)   vmul_n_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_SS_V4F32(__a__, __b__)   vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define  AKSIMD_MADD_V4F32(__a__, __b__, __c__)   vmlaq_f32( (__c__), (__a__), (__b__) )
  Vector multiply-add operation.
#define  AKSIMD_MSUB_V4F32(__a__, __b__, __c__)   AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
  Vector multiply-subtract operation. Careful: vmlsq_f32 does c-(a*b) and not the expected (a*b)-c.
#define  AKSIMD_MADD_V2F32(__a__, __b__, __c__)   AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define  AKSIMD_MSUB_V2F32(__a__, __b__, __c__)   AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define  AKSIMD_MADD_V4F32_SCALAR(__a__, __b__, __c__)   vmlaq_n_f32( (__c__), (__a__), (__b__) )
#define  AKSIMD_MADD_V2F32_SCALAR(__a__, __b__, __c__)   vmla_n_f32( (__c__), (__a__), (__b__) )
#define  AKSIMD_MIN_V4F32(__a__, __b__)   vminq_f32( (__a__), (__b__) )
#define  AKSIMD_MIN_V2F32(__a__, __b__)   vmin_f32( (__a__), (__b__) )
#define  AKSIMD_MAX_V4F32(__a__, __b__)   vmaxq_f32( (__a__), (__b__) )
#define  AKSIMD_MAX_V2F32(__a__, __b__)   vmax_f32( (__a__), (__b__) )
#define  AKSIMD_ABS_V4F32(__a__)   vabsq_f32((__a__))
  Returns absolute value.
#define  AKSIMD_NEG_V2F32(__a__)   vneg_f32( (__a__) )
  Changes the sign.
#define  AKSIMD_NEG_V4F32(__a__)   vnegq_f32( (__a__) )
#define  AKSIMD_SQRT_V4F32(__vec__)   vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
  Square root (4 floats). Approximate: computed as the reciprocal estimate of the reciprocal square root estimate.
#define  AKSIMD_RSQRT_V4F32(__a__)   vrsqrteq_f32( (__a__) )
  Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a).
#define  AKSIMD_SQRT_V2F32(__vec__)   vrecpe_f32( vrsqrte_f32( __vec__ ) )
  Square root (2 floats). Approximate: computed as the reciprocal estimate of the reciprocal square root estimate.
AkForceInline AKSIMD_V4F32  AKSIMD_DIV_V4F32 (AKSIMD_V4F32 a, AKSIMD_V4F32 b)
  Rough estimation of division.
AkForceInline AKSIMD_V4F32  AKSIMD_MADD_SS_V4F32 (const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
  Vector multiply-add operation.
static AkForceInline void  AKSIMD_HORIZONTALADD (AKSIMD_V4F32 &vVec)
static AkForceInline AKSIMD_V4F32  AKSIMD_COMPLEXMUL (AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2)
  Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts.

AKSIMD packing / unpacking



#define  AKSIMD_UNPACKLO_VECTOR8I16(__a__, __b__)   vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
#define  AKSIMD_UNPACKHI_VECTOR8I16(__a__, __b__)   vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
#define  AKSIMD_HILO_V2F32(in_vec1, in_vec2)   vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
#define  AKSIMD_TRANSPOSE_V2F32(in_vec1, in_vec2)   vtrn_f32( in_vec1, in_vec2 )
#define  AKSIMD_TRANSPOSE_V4F32(in_vec1, in_vec2)   vtrnq_f32( in_vec1, in_vec2 )
#define  AKSIMD_SWAP_V2F32(in_vec)   vrev64_f32( in_vec )
  V1 = {a,b} => VR = {b,a}.
AkForceInline AKSIMD_V4F32  AKSIMD_UNPACKLO_V4F32 (const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
AkForceInline AKSIMD_V4F32  AKSIMD_UNPACKHI_V4F32 (const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
AkForceInline AKSIMD_V4I32  AKSIMD_PACKS_V4I32 (const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)

AKSIMD vector comparison

Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.



#define  AKSIMD_CMP_CTRLMASK   uint32x4_t
#define  AKSIMD_GTEQ_V4F32(__a__, __b__)   vcgeq_f32( (__a__), (__b__))
  Compare each float element and return control mask.
#define  AKSIMD_GT_V4F32(__a__, __b__)   vcgtq_f32( (__a__), (__b__))
  Compare each float element and return control mask.
#define  AKSIMD_LTEQ_V4F32(__a__, __b__)   vcleq_f32( (__a__), (__b__))
  Compare each float element and return control mask.
#define  AKSIMD_LT_V4F32(__a__, __b__)   vcltq_f32( (__a__), (__b__))
  Compare each float element and return control mask.
#define  AKSIMD_GTEQ_V4I32(__a__, __b__)   vcgeq_s32( (__a__), (__b__))
  Compare each integer element and return control mask.
#define  AKSIMD_EQ_V4F32(__a__, __b__)   vceqq_f32( (__a__), (__b__))
  Compare each float element and return control mask.
#define  AKSIMD_EQ_V4I32(__a__, __b__)   vceqq_s32( (__a__), (__b__))
  Compare each integer element and return control mask.
#define  AKSIMD_VSEL_V4F32(__a__, __b__, __c__)   vbslq_f32( (__c__), (__b__), (__a__) )
  Returns a where the control mask is 0 and b where the control mask is non-zero. The control mask is passed in c and is usually produced by one of the comparison operations above.
#define  AKSIMD_SEL_GTEQ_V4F32(__a__, __b__, __cond1__, __cond2__)   AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
#define  AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)   AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
#define  AKSIMD_SPLAT_V4F32(var, idx)   vmovq_n_f32(vgetq_lane_f32(var, idx))
static AkForceInline int  AKSIMD_MASK_V4F32 (const AKSIMD_V4UI32 &in_vec1)

Detailed Description

AKSIMD - arm_neon implementation

Definition in file AkSimd.h.