Wwise SDK 2022.1.4
_ak_simd_avx_8h_source
Version
menu_open
link
Wwise SDK 2022.1.4
|
AkSimdAvx.h
Go to the documentation of this file.
/// Builds an 8-float vector from scalars listed from the highest element (_h) down to the lowest element (_a), which is the argument order of _mm256_set_ps.
80 #define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )
/// Builds a vector of 4 doubles, listed from the highest element (_d) down to the lowest (_a), and reinterprets its bits as a float vector (_mm256_castpd_ps performs no value conversion).
83 #define AKSIMD_SETV_V4F64( _d, _c, _b, _a ) _mm256_castpd_ps( _mm256_set_pd( (_d), (_c), (_b), (_a) ) )
95 /// Note that this should be used instead of, e.g., adding and using a macro "AKSIMD_INSERT_V8I32(m, i, idx)",
96 /// because there is no direct corresponding instruction for an insert into a 256-bit vector. You should load into 128-bit vectors
97 /// and use that. Some compilers do not handle _mm256_insert_epi32 (etc.) well, or do not even include them.
/// Stores the 8 floats of __vec__ to __addr__ (cast to AkReal32*) with _mm256_storeu_ps, the unaligned store, so __addr__ does not need 32-byte alignment.
118 #define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
/// Stores only the lowest float of __vec__ to __addr__ (cast to AkReal32*): the vector is narrowed to its lower 128 bits and written with _mm_store_ss.
122 #define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) )
138 /// For each 128b lane, swap the 2 lower floats with each other and the 2 higher floats with each other. ( h g f e d c b a -> g h e f c d a b )
139 #define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
141 /// For each 128b lane, exchange the pair of lower floats with the pair of higher floats. ( h g f e d c b a -> f e h g b a d c )
142 #define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
/// For each 128b lane, rotate the 4 floats by one position: every float moves one slot lower and the lowest wraps to the top. ( h g f e d c b a -> e h g f a d c b )
/// NOTE(review): mapping derived from the index convention shown by the sibling BADC/CDAB macros; AKSIMD_SHUFFLE itself is defined outside this excerpt.
145 #define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
147 /// For each 128b lane, duplicates the odd items into the even items ( h g f e d c b a -> h h f f d d b b )
150 /// For each 128b lane, duplicates the even items into the odd items ( h g f e d c b a -> g g e e c c a a )
153 /// Shuffle 32-bit integers within 128-bit lanes using the control in i, and return the results.
/// Built on _mm256_shuffle_ps: in each lane the two lower results are selected from a and the two upper results from b.
/// The casts to and from the float vector type only reinterpret bits; no value conversion takes place.
154 #define AKSIMD_SHUFFLE_V8I32( a, b, i ) _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), i ))
156 /// single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
166 /// Selects the lower 128b lane of a and the lower 128b lane of b to form the result: a = ( B A ), b = ( D C ) -> ( C A )
167 #define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(2, 0))
169 /// Selects the higher 128b lane of a and the higher 128b lane of b to form the result: a = ( B A ), b = ( D C ) -> ( D B )
170 #define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(3, 1))
180 AkForceInline void AKSIMD_TRANSPOSE8X4_V8F32(AKSIMD_V8F32& A, AKSIMD_V8F32& B, AKSIMD_V8F32& C, AKSIMD_V8F32& D)
/// Subtracts the lowest float of b from the lowest float of a. b is masked down to its lowest element first,
/// so the upper seven floats of a only have 0.0f subtracted from them and keep their value.
/// The integer mask is reinterpreted with _mm256_castsi256_ps because _mm256_and_ps takes __m256 operands, not __m256i.
209 #define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( (a), _mm256_and_ps( (b), _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )
/// Adds the lowest float of b to the lowest float of a. b is masked down to its lowest element first,
/// so the upper seven floats of a only have +0.0f added to them and keep their value.
/// The integer mask is reinterpreted with _mm256_castsi256_ps because _mm256_and_ps takes __m256 operands, not __m256i.
222 #define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( (a), _mm256_and_ps( (b), _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )
/// Multiplies the lowest float of a by the lowest float of b. The blend mask 0xfe replaces elements 1-7 of b with 1.0f,
/// so the upper seven floats of a are multiplied by 1.0f.
234 #define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps(b, _mm256_set1_ps(1.0f), 0xfe ) )
268 /// Horizontal add across the entire vector: vVec will be updated to contain the sum of every input element of vVec.
283 /// Cross-platform SIMD multiplication of 4 complex data elements (8 floats) with interleaved real and imaginary parts
284 static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(const AKSIMD_V8F32 cIn1, const AKSIMD_V8F32 cIn2)
287 __m256 in2Shuf = _mm256_shuffle_ps(cIn2, cIn2, 0xB1); // shuf multiplicand (c3, d3, c2, d2, c1, d1, c0, d0)
289 __m256 temp = _mm256_mul_ps(imag1Ext, in2Shuf); // temp (b3c3, b3d3, b2c2, b2d2, b1c1, b1d1, b0c0, b0d0)
291 __m256 out = _mm256_addsub_ps(mul, temp); // final (a3d3+b3c3, a3c3-b3d3, a2d2+b2c2, a2c2-b2d2, a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0)
335 /// Returns vA when the control mask is 0 and vB when the control mask is non-zero. The control mask is vMask and is usually provided by the comparison operations above.
336 static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
/// Selects between __a__ and __b__ with AKSIMD_VSEL_V8F32, using AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) as the control mask:
/// the result is __a__ when the mask is 0 and __b__ when the mask is non-zero.
/// NOTE(review): presumably the mask is set where __cond1__ >= __cond2__; AKSIMD_GTEQ_V8F32 is defined outside this excerpt.
342 #define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) )
/// Selects between __c__ and __b__ with AKSIMD_VSEL_V8F32, using AKSIMD_GTEQ_V8F32( __a__, 0 ) as the control mask:
/// the result is __c__ when the mask is 0 and __b__ when the mask is non-zero. Note that __a__ is the tested value, not one of the selected values.
/// NOTE(review): presumably the mask is set where __a__ >= 0; AKSIMD_GTEQ_V8F32 is defined outside this excerpt.
345 #define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( __a__, _mm256_set1_ps(0) ) )
/// Replicates element idx of var by shuffling var with itself using the same index for all four selectors.
/// NOTE(review): the shuffle macros in this file are documented as working per 128b lane, so idx presumably ranges over 0..3 and each lane is filled with its own element idx - confirm against AKSIMD_SHUFFLE_V8F32.
347 #define AKSIMD_SPLAT_V8F32(var, idx) AKSIMD_SHUFFLE_V8F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
/// Builds a vector of eight 32-bit integers from scalars listed from the highest element (_h) down to the lowest element (_a), which is the argument order of _mm256_set_epi32.
373 #define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )
376 /// Note that this should be used instead of, e.g., adding and using a macro "AKSIMD_INSERT_V8I32(m, i, idx)",
377 /// because there is no direct corresponding instruction for an insert into a 256-bit vector. You should load into 128-bit vectors
378 /// and use that. Some compilers do not handle _mm256_insert_epi32 (etc.) well, or do not even include them.
Was this page helpful?
Need Support?
Questions? Problems? Need more info? Contact us, and we can help!
Visit our Support page. Tell us about your project. We're here to help.
Register your project and we'll help you get started with no strings attached!
Get started with Wwise