include/AK/SoundEngine/Platforms/arm_neon/AkSimd.h File Reference

Go to the source code of this file.

Defines
Platform specific memory size alignment for allocation purposes

#define	AKSIMD_ALIGNSIZE(__Size__) (((__Size__) + 15) & ~15)
AKSIMD loading / setting

#define	AKSIMD_LOAD_V4F32(__addr__) vld1q_f32( (float32_t*)(__addr__) )
	Loads four single-precision, floating-point values (see _mm_load_ps).
#define	AKSIMD_LOADU_V4F32(__addr__) vld1q_f32( (float32_t*)(__addr__) )
#define	AKSIMD_LOAD1_V4F32(__scalar__) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
#define	AKSIMD_SET_V4F32(__scalar__) vdupq_n_f32( __scalar__ )
#define	AKSIMD_SET_V4I32(__scalar__) vdupq_n_s32( __scalar__ )
	Sets the four integer values to __scalar__.
#define	AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
#define	AKSIMD_LOAD_SS_V4F32(__addr__) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 );
#define	AKSIMD_LOAD_V4I32(__addr__) vld1q_s32( (const int32_t*)(__addr__) )
	Loads four 32-bit signed integer values (aligned).
#define	AKSIMD_LOAD_V8I16(__addr__) vld1q_s16( (const int16_t*)(__addr__) )
	Loads 8 16-bit signed integer values (aligned).
#define	AKSIMD_LOAD_V4I16(__addr__) vld1_s16( (const int16_t*)(__addr__) )
	Loads 4 16-bit signed integer values (aligned).
#define	AKSIMD_LOADU_V4I32(__addr__) *__addr__
	Loads an unaligned 128-bit value (see _mm_loadu_si128).
#define	AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
	Sets the four 32-bit integer values to zero (see _mm_setzero_si128).
#define	AKSIMD_LOAD_V2F32(__addr__) vld1_f32( (float32_t*)(__addr__) )
	Loads two single-precision, floating-point values.
#define	AKSIMD_LOAD_V2F32_LANE(__addr__, __vec__, __lane__) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
#define	AKSIMD_SET_V2F32(__scalar__) vdup_n_f32( __scalar__ )
	Sets the two single-precision, floating-point values to __scalar__.
#define	AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
	Sets the two single-precision, floating-point values to zero.
#define	AKSIMD_LOAD_V4F32X2(__addr__) vld2q_f32( (float32_t*)(__addr__) )
	Loads data from memory and de-interleaves.
#define	AKSIMD_LOAD_V2F32X2(__addr__) vld2_f32( (float32_t*)(__addr__) )
#define	AKSIMD_LOAD_V2F32X2_LANE(__addr__, __vec__, __lane__) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
	Loads data from memory and de-interleaves; only selected lane.
#define	AKSIMD_LOAD_V4F32X4_LANE(__addr__, __vec__, __lane__) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
AKSIMD storing

#define	AKSIMD_STORE_V4F32(__addr__, __vName__) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
	Stores four single-precision, floating-point values. The address must be 16-byte aligned.
#define	AKSIMD_STOREU_V4F32(__addr__, __vec__) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
	Stores four single-precision, floating-point values. The address does not need to be 16-byte aligned.
#define	AKSIMD_STORE1_V4F32(__addr__, __vec__) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
#define	AKSIMD_STORE_V4I32(__addr__, __vec__) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
	Stores four 32-bit integer values. The address must be 16-byte aligned.
#define	AKSIMD_STOREU_V4I32(__addr__, __vec__) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
	Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
#define	AKSIMD_STOREU_V4UI32(__addr__, __vec__) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
	Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
#define	AKSIMD_STORE_V2F32(__addr__, __vName__) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
	Stores two single-precision, floating-point values. The address must be 16-byte aligned.
#define	AKSIMD_STORE_V4F32X2(__addr__, __vName__) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
	Stores data by interleaving into memory.
#define	AKSIMD_STORE_V2F32X2(__addr__, __vName__) vst2_f32( (float32_t*)(__addr__), (__vName__) )
AKSIMD conversion

#define	AKSIMD_CONVERT_V4I32_TO_V4F32(__vec__) vcvtq_f32_s32( __vec__ )
#define	AKSIMD_CONVERT_V4F32_TO_V4I32(__vec__) vcvtq_s32_f32( __vec__ )
#define	AKSIMD_TRUNCATE_V4F32_TO_V4I32(__vec__) vcvtq_s32_f32( (__vec__) )
#define	AKSIMD_CONVERT_V2F32_TO_V2I32(__vec__) vcvt_s32_f32( __vec__ )
AKSIMD logical operations

#define	AKSIMD_AND_V4I32(__a__, __b__) vandq_s32( (__a__), (__b__) )
#define	AKSIMD_CMPGT_V8I16(__a__, __b__) vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
#define	AKSIMD_CMPLE_V4F32(__a__, __b__) vcleq_f32( (__a__), (__b__) )
	Compares for less than or equal (see _mm_cmple_ps).
AKSIMD shifting

#define	AKSIMD_SHIFTLEFT_V4I32(__vec__, __shiftBy__) vshlq_n_s32( (__vec__), (__shiftBy__) )
#define	AKSIMD_SHIFTRIGHTARITH_V4I32(__vec__, __shiftBy__) vrshrq_n_s32( (__vec__), (__shiftBy__) )
AKSIMD vector comparison
Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
#define	AKSIMD_CMP_CTRLMASK uint32x4_t
#define	AKSIMD_GTEQ_V4F32(__a__, __b__) vcgeq_f32( (__a__), (__b__))
	Compare each float element and return control mask.
#define	AKSIMD_GTEQ_V4I32(__a__, __b__) vcgeq_s32( (__a__), (__b__))
	Compare each integer element and return control mask.
#define	AKSIMD_EQ_V4F32(__a__, __b__) vceqq_f32( (__a__), (__b__))
	Compare each float element and return control mask.
#define	AKSIMD_EQ_V4I32(__a__, __b__) vceqq_s32( (__a__), (__b__))
	Compare each integer element and return control mask.
#define	AKSIMD_VSEL_V4F32(__a__, __b__, __c__) vbslq_f32( (__c__), (__b__), (__a__) )
	Returns a where the control mask is 0 and b where the control mask is non-zero; the control mask is passed in c and is usually produced by the comparison operations above.
#define	AKSIMD_SEL_GTEQ_V4F32(__a__, __b__, __cond1__, __cond2__) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )
#define	AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, AKSIMD_SETZERO_V4F32() ) )
#define	AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
AKSIMD types

#define	AKSIMD_V4F32_SUPPORTED
typedef int32x4_t	AKSIMD_V4I32
	Vector of 4 32-bit signed integers.
typedef int16x8_t	AKSIMD_V8I16
	Vector of 8 16-bit signed integers.
typedef int16x4_t	AKSIMD_V4I16
	Vector of 4 16-bit signed integers.
typedef uint32x4_t	AKSIMD_V4UI32
	Vector of 4 32-bit unsigned integers.
typedef uint32x2_t	AKSIMD_V2UI32
	Vector of 2 32-bit unsigned integers.
typedef int32x2_t	AKSIMD_V2I32
	Vector of 2 32-bit signed integers.
typedef float32_t	AKSIMD_F32
	32-bit float.
typedef float32x2_t	AKSIMD_V2F32
	Vector of 2 32-bit floats.
typedef float32x4_t	AKSIMD_V4F32
	Vector of 4 32-bit floats.
typedef uint32x4_t	AKSIMD_V4COND
	Vector of 4 comparison results.
typedef uint32x4_t	AKSIMD_V4ICOND
	Vector of 4 comparison results.
typedef uint32x4_t	AKSIMD_V4FCOND
	Vector of 4 comparison results.
typedef float32x2x2_t	AKSIMD_V2F32X2
typedef float32x4x2_t	AKSIMD_V4F32X2
typedef float32x4x4_t	AKSIMD_V4F32X4
AKSIMD shuffling

#define	AKSIMD_COMBINE_V2F32(a, b) vcombine_f32( a, b )
#define	AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define	AKSIMD_SHUFFLE_V4F32(a, b, zyxw) _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
#define	AKSIMD_SHUFFLE_BADC(__a__) vrev64q_f32( __a__ )
	Swap the 2 lower floats together and the 2 higher floats together.
#define	AKSIMD_SHUFFLE_CDAB(__a__) vcombine_f32( vget_high_f32(__a__), vget_low_f32(__a__) )
	Swap the 2 lower floats with the 2 higher floats.
#define	AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
	Duplicates the odd items into the even items (d c b a -> d d b b).
#define	AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
	Duplicates the even items into the odd items (d c b a -> c c a a).
AKSIMD_V4F32	AKSIMD_MOVEHL_V4F32 (const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw)
AKSIMD_V4F32	AKSIMD_MOVELH_V4F32 (const AKSIMD_V4F32 &xyzw, const AKSIMD_V4F32 &abcd)
AKSIMD arithmetic

#define	AKSIMD_SUB_V4F32(__a__, __b__) vsubq_f32( (__a__), (__b__) )
#define	AKSIMD_SUB_V2F32(__a__, __b__) vsub_f32( (__a__), (__b__) )
#define	AKSIMD_SUB_SS_V4F32(__a__, __b__) vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) );
#define	AKSIMD_ADD_V4F32(__a__, __b__) vaddq_f32( (__a__), (__b__) )
#define	AKSIMD_ADD_V2F32(__a__, __b__) vadd_f32( (__a__), (__b__) )
#define	AKSIMD_ADD_V4I32(__a__, __b__) vaddq_s32( (__a__), (__b__) )
	Adds the four integers of a and b.
#define	AKSIMD_COMP_V4F32(__a__, __b__) vceqq_f32( (__a__), (__b__) )
#define	AKSIMD_COMP_V2F32(__a__, __b__) vceq_f32( (__a__), (__b__) )
#define	AKSIMD_ADD_SS_V4F32(__a__, __b__) vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define	AKSIMD_MUL_V4F32(__a__, __b__) vmulq_f32( (__a__), (__b__) )
#define	AKSIMD_MUL_V4F32_SCALAR(__a__, __b__) vmulq_n_f32( (__a__), (__b__) )
#define	AKSIMD_MUL_V2F32(__a__, __b__) vmul_f32( (__a__), (__b__) )
#define	AKSIMD_MUL_V2F32_SCALAR(__a__, __b__) vmul_n_f32( (__a__), (__b__) )
#define	AKSIMD_MUL_SS_V4F32(__a__, __b__) vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define	AKSIMD_MADD_V4F32(__a__, __b__, __c__) AKSIMD_ADD_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
	Vector multiply-add operation.
#define	AKSIMD_MSUB_V4F32(__a__, __b__, __c__) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
#define	AKSIMD_MADD_V2F32(__a__, __b__, __c__) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define	AKSIMD_MSUB_V2F32(__a__, __b__, __c__) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define	AKSIMD_MADD_V4F32_INST(__a__, __b__, __c__) vmlaq_f32( (__c__), (__a__), (__b__) )
#define	AKSIMD_MADD_V2F32_INST(__a__, __b__, __c__) vmla_f32( (__c__), (__a__), (__b__) )
#define	AKSIMD_MADD_V4F32_SCALAR(__a__, __b__, __c__) vmlaq_n_f32( (__c__), (__a__), (__b__) )
#define	AKSIMD_MADD_V2F32_SCALAR(__a__, __b__, __c__) vmla_n_f32( (__c__), (__a__), (__b__) )
#define	AKSIMD_MIN_V4F32(__a__, __b__) vminq_f32( (__a__), (__b__) )
#define	AKSIMD_MIN_V2F32(__a__, __b__) vmin_f32( (__a__), (__b__) )
#define	AKSIMD_MAX_V4F32(__a__, __b__) vmaxq_f32( (__a__), (__b__) )
#define	AKSIMD_MAX_V2F32(__a__, __b__) vmax_f32( (__a__), (__b__) )
#define	AKSIMD_ABS_V4F32(__a__) vabsq_f32((__a__))
	Returns absolute value.
#define	AKSIMD_NEG_V2F32(__a__) vneg_f32( (__a__) )
	Changes the sign.
#define	AKSIMD_NEG_V4F32(__a__) vnegq_f32( (__a__) )
#define	AKSIMD_SQRT_V4F32(__vec__) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
	Square root (4 floats).
#define	AKSIMD_SQRT_V2F32(__vec__) vrecpe_f32( vrsqrte_f32( __vec__ ) )
	Square root (2 floats).
AkForceInline AKSIMD_V4F32	AKSIMD_DIV_V4F32 (AKSIMD_V4F32 a, AKSIMD_V4F32 b)
	Rough estimation of division.
AkForceInline AKSIMD_V4F32	AKSIMD_MADD_SS_V4F32 (const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
	Vector multiply-add operation.
static AkForceInline void	AKSIMD_HORIZONTALADD (AKSIMD_V4F32 &vVec)
static AkForceInline AKSIMD_V4F32	AKSIMD_COMPLEXMUL (AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2)
	Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts.
AKSIMD packing / unpacking

#define	AKSIMD_UNPACKLO_VECTOR8I16(__a__, __b__) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
#define	AKSIMD_UNPACKHI_VECTOR8I16(__a__, __b__) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
#define	AKSIMD_HILO_V2F32(in_vec1, in_vec2) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
#define	AKSIMD_TRANSPOSE_V2F32(in_vec1, in_vec2) vtrn_f32( in_vec1, in_vec2 )
#define	AKSIMD_TRANSPOSE_V4F32(in_vec1, in_vec2) vtrnq_f32( in_vec1, in_vec2 )
#define	AKSIMD_SWAP_V2F32(in_vec) vrev64_f32( in_vec )
	V1 = {a,b} => VR = {b,a}.
AkForceInline AKSIMD_V4F32	AKSIMD_UNPACKLO_V4F32 (const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
AkForceInline AKSIMD_V4F32	AKSIMD_UNPACKHI_V4F32 (const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
AkForceInline AKSIMD_V4I32	AKSIMD_PACKS_V4I32 (const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)

Detailed Description

AKSIMD - arm_neon implementation

Definition in file AkSimd.h.

Was this page helpful?

Need Support?

Questions? Problems? Need more info? Contact us, and we can help!

Visit our Support page

Tell us about your project. We're here to help.

Get started with Wwise

Wwise SDK 2015.1.9

include/AK/SoundEngine/Platforms/arm_neon/AkSimd.h

include/AK/SoundEngine/Platforms/arm_neon/AkSimd.h File Reference

Defines

AKSIMD types

AKSIMD shuffling

AKSIMD arithmetic

AKSIMD packing / unpacking

Detailed Description

Was this page helpful?

Need Support?

Tell us about your project. We're here to help.