Wwise SDK 2022.1.4
AkSimd.h
47 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
48 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Controls the maximum amount of prefetching that is desirable (assuming an 8-way cache)
50 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )
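As a minimal usage sketch (ProcessBuffer and its arguments are hypothetical, and AKSIMD_LOAD_V4F32 is defined earlier in this header), a streaming loop might prefetch one cache line ahead of its read position:

static void ProcessBuffer( const AkReal32* pIn, AkReal32* pOut, AkUInt32 uNumFloats ) // uNumFloats assumed a multiple of 4
{
	for ( AkUInt32 i = 0; i < uNumFloats; i += 4 )
	{
		AKSIMD_PREFETCHMEMORY( AKSIMD_ARCHCACHELINESIZE, pIn + i ); // hint: fetch the next cache line while we work
		AKSIMD_V4F32 vTmp = AKSIMD_LOAD_V4F32( pIn + i );
		AKSIMD_STORE_V4F32( pOut + i, vTmp );
	}
}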
141 #define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
145 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
149 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
153 #define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) _mm_store_sd( (AkReal64*)(__addr__), _mm_castps_pd(__vec__) )
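A typical use of the single-lane store is writing back the scalar result of a horizontal reduction; a small sketch, using AKSIMD_HORIZONTALADD_V4F32 as declared further down in this file:

static AkReal32 SumLanes( AKSIMD_V4F32 vVec )
{
	AkReal32 fSum;
	AKSIMD_STORE1_V4F32( &fSum, AKSIMD_HORIZONTALADD_V4F32( vVec ) ); // only the lowest lane reaches memory
	return fSum;
}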
170 #define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), i ))
193 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))
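To illustrate the lane rotation (assuming AKSIMD_SETV_V4F32 takes its arguments from highest to lowest lane, as _mm_set_ps does):

AKSIMD_V4F32 v = AKSIMD_SETV_V4F32( 4.f, 3.f, 2.f, 1.f ); // lanes, lowest first: a=1, b=2, c=3, d=4
AKSIMD_V4F32 r = AKSIMD_SHUFFLE_BCDA( v );                // lanes, lowest first: 2, 3, 4, 1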
238 /// Vector multiply-add operation. (If targeting a platform or architecture with FMA (AVX2 implies FMA), using the FMA intrinsics directly tends to be slightly more desirable.)
243 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
244 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
248 #define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )
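A sketch of how a client could route the operation to real FMA hardware when building with AVX2; the MY_MADD_V4F32 name and the __AVX2__ guard are illustrative, not part of this header:

#include <immintrin.h> // _mm_fmadd_ps
#if defined(__AVX2__) // AVX2 implies FMA3
#define MY_MADD_V4F32( __a__, __b__, __c__ ) _mm_fmadd_ps( (__a__), (__b__), (__c__) )      // fused: a single rounding step
#else
#define MY_MADD_V4F32( __a__, __b__, __c__ ) AKSIMD_MADD_V4F32( (__a__), (__b__), (__c__) ) // mul then add: two rounding steps
#endif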
296 static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
302 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
303 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
319 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
320 static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
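For instance, a minimal sketch multiplying the interleaved complex pairs (1+2i, 3+4i) and (5+6i, 7+8i), again assuming AKSIMD_SETV_V4F32 orders its arguments from highest to lowest lane:

AKSIMD_V4F32 vA = AKSIMD_SETV_V4F32( 4.f, 3.f, 2.f, 1.f );  // (1+2i, 3+4i), lanes hold (re0, im0, re1, im1)
AKSIMD_V4F32 vB = AKSIMD_SETV_V4F32( 8.f, 7.f, 6.f, 5.f );  // (5+6i, 7+8i)
AKSIMD_V4F32 vP = AKSIMD_COMPLEXMUL_V4F32( vA, vB );        // (-7, 16, -11, 52), i.e. (-7+16i, -11+52i)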
343 #define AKSIMD_ADDSUB_V4F32( a, b ) _mm_add_ps( a, _mm_xor_ps(b, AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f)))
348 #define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
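For reference, the matching setup uses the standard SSE control-register intrinsic (not part of this header); called once per audio thread, it makes the assertion above hold:

#include <xmmintrin.h>
_MM_SET_FLUSH_ZERO_MODE( _MM_FLUSH_ZERO_ON ); // flush denormal results to zero
AKSIMD_ASSERTFLUSHZEROMODE;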
385 static AkForceInline AKSIMD_V4I32 AKSIMD_MULLO_V4I32(const AKSIMD_V4I32 vIn1, const AKSIMD_V4I32 vIn2)
392 return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); // shuffle results to [63..0] and pack
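On SSE4.1-capable builds the same low multiply is a single intrinsic; a sketch of an equivalent (the MulLo32 helper name is hypothetical, and the shuffle-based emulation above exists precisely because plain SSE2 lacks this instruction):

#if defined(__SSE4_1__)
#include <smmintrin.h>
static AkForceInline AKSIMD_V4I32 MulLo32( AKSIMD_V4I32 vIn1, AKSIMD_V4I32 vIn2 )
{
	return _mm_mullo_epi32( vIn1, vIn2 ); // 32x32 -> low 32 bits, per lane
}
#endif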
421 static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
456 static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
471 _mm_castps_si128 (_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
472 _mm_castps_si128 (_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
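A usage sketch of the smaller gather, assuming the X2 struct exposes its two vectors as val[0] and val[1] (mirroring NEON's int32x4x2_t); the p0..p3 pointers are hypothetical and would each reference an interleaved pair of 16-bit samples:

AKSIMD_V4I32X2 vPairs = AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2( p3, p2, p1, p0 );
AKSIMD_V4I32 vFirst  = vPairs.val[0]; // first 16-bit value gathered from each address
AKSIMD_V4I32 vSecond = vPairs.val[1]; // second 16-bit value gathered from each address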
507 /// Return a when the control mask is 0 and b when it is non-zero; the mask is the third argument and is usually produced by the comparison operations above
508 static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
516 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )
519 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )
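These selects enable branch-free, per-lane conditionals; for example, a minimal clamp-to-zero sketch (vIn is hypothetical):

AKSIMD_V4F32 vZero    = AKSIMD_SETV_V4F32( 0.f, 0.f, 0.f, 0.f );
AKSIMD_V4F32 vClamped = AKSIMD_SEL_GTEQ_V4F32( vZero, vIn, vIn, vZero ); // per lane: (vIn >= 0) ? vIn : 0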
521 #define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
567 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
571 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
599 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))
603 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))
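One AKSIMD_V4I32 holds eight packed float16 values, so expanding a full vector takes both macros; a short sketch (vPacked is hypothetical):

AKSIMD_V4F32 vLo = AKSIMD_CONVERT_V4F16_TO_V4F32_LO( vPacked ); // half floats 0..3 as float32
AKSIMD_V4F32 vHi = AKSIMD_CONVERT_V4F16_TO_V4F32_HI( vPacked ); // half floats 4..7 as float32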
608 __m128i expMantShifted = _mm_srli_epi32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds
611 __m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));
613 // if the fp16 value was Inf or NaN, preserve the Inf/NaN exponent field (we can simply OR the new Inf bits over the result of the scaling attempt above)
641 __m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
645 __m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23); // smallest float32 that yields a normalized float16
648 __m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));
651 __m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000)); // mark the lanes whose value stays below the smallest float32 that maps to a float16 Inf/NaN
653 __m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000)); // mark the parts of the vector where we have a mantissa (i.e. NAN) as 0xffffffff
654 __m128i nanMantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000)); // set the NaN mantissa bit if the mantissa suggests this is NaN
655 __m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
656 __m128i infNanFloat = _mm_or_si128(infData, nanMantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
658 __m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));
664 __m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD); // move 16b ints (x,x,x,x,d,c,b,a) down to (x,x,x,x,x,x,d,b)
665 __m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD); // move 16b ints (h,g,f,e,x,x,x,x) down to (x,x,h,f,x,x,x,x)
666 __m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4); // combine - (x, x, h, f, x, x, d, b)
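A round-trip sketch, assuming the four converted halves end up packed in the low 64 bits of the result vector, as the shuffles above arrange (vFloats is hypothetical):

AKSIMD_V4I32 vHalf = AKSIMD_CONVERT_V4F32_TO_V4F16( vFloats );  // four float16 values, packed low
AKSIMD_V4F32 vBack = AKSIMD_CONVERT_V4F16_TO_V4F32_LO( vHalf ); // recovered floats, at float16 precision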
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND(AkUInt32 x)
Populates the full vector with the mask[3:0], setting each to 0 or ~0.
Definition: AkSimd.h:114
#define AKSIMD_SHUFFLE_BADC(__a__)
Swap the 2 lower floats together and the 2 higher floats together.
Definition: AkSimd.h:187
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add operation. (If targeting a platform or architecture with FMA (AVX2 implies FMA), using the FMA intrinsics directly tends to be slightly more desirable.)
Definition: AkSimd.h:243
static AkForceInline bool AKSIMD_TESTONES_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:534
static AkForceInline AKSIMD_V4I32 AKSIMD_MULLO_V4I32(const AKSIMD_V4I32 vIn1, const AKSIMD_V4I32 vIn2)
Multiplies the low 32 bits of a by b and stores the result in a V4I32 (no overflow)
Definition: AkSimd.h:385
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32(AKSIMD_V4F32 vVec)
Definition: AkSimd.h:287
static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER(AKSIMD_V4I32 vec)
Definition: AkSimd.h:605
static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32(const AKSIMD_V4F32 &x)
Rounds to upper value.
Definition: AkSimd.h:277
static AkForceInline bool AKSIMD_TESTZERO_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:526
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16(AKSIMD_V4F32 vec)
Definition: AkSimd.h:626
static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT(AKSIMD_V4F32 &vVec, const AKSIMD_V4F32 &vfSigns)
Definition: AkSimd.h:296
#define AKSIMD_XOR_V4F32(a, b)
Binary xor for single-precision floating-point.
Definition: AkSimd.h:274
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:456
static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32(AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask)
Return a when the control mask is 0 and b when it is non-zero; the mask is the third argument and is usually produced by the comparison operations above.
Definition: AkSimd.h:508
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:421
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32(const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2)
Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts.
Definition: AkSimd.h:303