Wwise SDK 2019.2.15
_platforms_2arm__neon_2_ak_simd_8h_source
Version
menu_open
link
Wwise SDK 2019.2.15
|
AkSimd.h
Go to the documentation of this file.
44 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)
45 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
47 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) __builtin_prefetch(((char *)(__add__))+(__offset__))
151 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 ) // loads one float into lane 0, zeroes the rest; no trailing semicolon so the macro can be used inside a larger expression
169 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) ) // loads one float into the given lane of a 2-float vector; no trailing semicolon so the macro expands to an expression
182 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) ) // de-interleaving 2-element load into one lane of a float32x2x2_t; no trailing semicolon so the macro expands to an expression
183 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) ) // de-interleaving 4-element load into one lane of a float32x4x4_t; no trailing semicolon so the macro expands to an expression
194 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
196 /// Stores four single-precision, floating-point values. The address does not need to be 16-byte aligned.
201 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
209 /// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
213 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) ) // NOTE(review): every sibling load/store macro here casts to float32_t*, but this one casts to AkReal32* — presumably equivalent (32-bit float); confirm and unify the cast type
216 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
217 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
254 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
258 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
263 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
284 uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds
286 // Determine if value is denorm or not, and use that to determine how much exponent should be scaled by, and if we need post-scale fp adjustment
288 uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
293 uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
312 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
324 uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
328 return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
343 uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
347 uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23); // smallest float32 that yields a normalized float16
353 uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000)); // true for lanes below 0x47800000 (65536.0f), i.e. values that remain finite in float16; anything at or above maps to inf/NaN
355 uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec)); // mark the parts of the vector where we have a mantissa (i.e. NAN) as 0xffffffff
356 uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000)); // set the NaN mantissa bit if mantissa suggests this is NaN
357 uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
358 uint32x4_t infNanData = vorrq_u32(infData, nantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
360 uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData); // and combine the results
420 vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
433 static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
441 static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
501 #define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
504 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
535 //#define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
539 //#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
568 vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) );
596 vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
628 vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
633 /// Vector multiply-subtract operation. Careful: vmlsq_f32 does c-(a*b) and not the expected (a*b)-c
644 #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
648 AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32( const AKSIMD_V4F32& __a__, const AKSIMD_V4F32& __b__, const AKSIMD_V4F32& __c__ )
701 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
702 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
710 float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2); // val[0] will be reals extended, val[1] will be imag
728 #define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
732 #define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
736 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
748 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
760 AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
770 #define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
790 static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
821 static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
875 /// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
879 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
882 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
890 static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
913 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
917 int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
926 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32(const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
Vector multiply-add operation.
Definition: AkSimd.h:648
static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER(uint16x4_t vecs16)
Definition: AkSimd.h:261
static AkForceInline AKSIMD_V4I32 AKSIMD_ROUND_V4F32_TO_V4I32(AKSIMD_V4F32 a)
Definition: AkSimd.h:233
#define AKSIMD_SET_V4I32(__scalar__)
Sets the four integer values to scalar
Definition: AkSimd.h:111
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:760
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32(AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2)
Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary par...
Definition: AkSimd.h:702
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:821
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:790
static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a)
Definition: AkSimd.h:113
static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:441
Definition: AkSimd.h:45
Definition: AkSimd.h:49
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32(AKSIMD_V4F32 vVec)
Definition: AkSimd.h:692
AKSIMD_V4F32 AKSIMD_MOVELH_V4F32(const AKSIMD_V4F32 &xyzw, const AKSIMD_V4F32 &abcd)
Definition: AkSimd.h:529
#define AKSIMD_EQ_V4I32(__a__, __b__)
Compare each integer element and return control mask.
Definition: AkSimd.h:873
Definition: AkSimd.h:48
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:736
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:607
Definition: AkSimd.h:75
static AkForceInline bool AKSIMD_TESTONES_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:924
Definition: AkSimd.h:71
Definition: AkSimd.h:46
AKSIMD_V4F32 AKSIMD_MOVEHL_V4F32(const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw)
Definition: AkSimd.h:515
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:748
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND(AkUInt32 x)
Definition: AkSimd.h:938
static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:433
static AkForceInline bool AKSIMD_TESTZERO_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:911
static AkForceInline int AKSIMD_MASK_V4F32(const AKSIMD_V4UI32 &in_vec1)
Definition: AkSimd.h:886
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16(AKSIMD_V4F32 vec)
Definition: AkSimd.h:310
static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a)
Definition: AkSimd.h:122
Was this page helpful?
Need Support?
Questions? Problems? Need more info? Contact us, and we can help!
Visit our Support page. Tell us about your project. We're here to help.
Register your project and we'll help you get started with no strings attached!
Get started with Wwise