00001 00002 // 00003 // Copyright (c) 2006 Audiokinetic Inc. / All Rights Reserved 00004 // 00006 00007 // AkSimd.h 00008 00011 00012 #ifndef _AK_SIMD_SSE_H_ 00013 #define _AK_SIMD_SSE_H_ 00014 00015 #include <AK/SoundEngine/Common/AkTypes.h> 00016 #include <xmmintrin.h> 00017 00020 00021 00022 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform 00023 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache) 00024 00025 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA ) 00026 00028 00029 00032 00033 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15) 00034 00035 00036 00039 00040 00041 typedef float AKSIMD_F32; 00042 typedef __m128 AKSIMD_V4F32; 00043 typedef AKSIMD_V4F32 AKSIMD_V4COND; 00044 typedef AKSIMD_V4F32 AKSIMD_V4FCOND; 00045 #define AKSIMD_V4F32_SUPPORTED 00046 00048 00049 00050 00053 00054 00056 #define AKSIMD_LOAD_V4F32( __addr__ ) _mm_load_ps( (AkReal32*)(__addr__) ) 00057 00060 #define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) ) 00061 00064 #define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) ) 00065 00068 #define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) ) 00069 00072 #define AKSIMD_SETZERO_V4F32() _mm_setzero_ps() 00073 00077 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) ) 00078 00080 00081 00082 00085 00086 00089 #define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_store_ps( (AkReal32*)(__addr__), (__vec__) ) 00090 00093 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) ) 00094 00097 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) ) 00098 00100 00101 00104 00105 00106 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE) 00107 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) 
_MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) ) 00108 00111 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) ) 00112 #define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i ) 00113 00119 #define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b ) 00120 00126 #define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b ) 00127 00129 #define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1)) 00130 00132 #define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2)) 00133 00135 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1)) 00136 00138 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1)) 00139 00141 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0)) 00142 00144 00145 00146 00149 00150 00153 #define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b ) 00154 00158 #define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b ) 00159 00162 #define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b ) 00163 00167 #define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b ) 00168 00171 #define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b ) 00172 00173 #define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b ) 00174 00179 #define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b ) 00180 00182 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) ) 00183 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) ) 00184 00186 #define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) ) 00187 00190 #define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b ) 00191 00194 #define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b ) 00195 00197 #define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a) 00198 00200 #define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__) 00201 00203 #define 
AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) ) 00204 00209 static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 & vVec) 00210 { 00211 __m128 vHighLow = _mm_movehl_ps(vVec, vVec); 00212 vVec = _mm_add_ps(vVec, vHighLow); 00213 vHighLow = _mm_shuffle_ps(vVec, vVec, 0x55); 00214 vVec = _mm_add_ps(vVec, vHighLow); 00215 } 00216 00217 static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns ) 00218 { 00219 AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns ); 00220 AKSIMD_HORIZONTALADD( vfDotProduct ); 00221 return AKSIMD_SHUFFLE_V4F32( vfDotProduct, vfDotProduct, AKSIMD_SHUFFLE(0,0,0,0) ); 00222 } 00223 00225 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 ) 00226 { 00227 static const AKSIMD_V4F32 vSign = { -1.f, 1.f, -1.f, 1.f }; 00228 00229 AKSIMD_V4F32 vTmp1 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(2,2,0,0)); 00230 vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 ); 00231 AKSIMD_V4F32 vTmp2 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(3,3,1,1)); 00232 vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign ); 00233 vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 ); 00234 return vTmp2; 00235 } 00236 00237 #ifdef AK_SSE3 00238 00239 #include <pmmintrin.h> 00240 00242 static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 ) 00243 { 00244 AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1); // multiplier real (a1, a1, a0, a0) 00245 vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2); // temp1 (a1d1, a1c1, a0d0, a0c0) 00246 AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1); // shuf multiplicand(c1, d1, c0, d0) 00247 AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1); // multiplier imag (b1, b1, b0, b0) 00248 xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1); // temp2 (b1c1, b1d1, b0c0, b0d0) 00249 AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2); // b1c1+a1d1, a1c1-b1d1, a0d0+b0d0, a0c0-b0c0 00250 return vCOut; 00251 } 00252 00253 #endif 00254 00255 #if 
defined _MSC_VER && ( _MSC_VER <= 1600 ) 00256 #define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON ) 00257 #else 00258 #define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON ) 00259 #endif 00260 00262 00263 00264 00267 00268 00270 #define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b ) 00271 00273 00274 00275 00278 00279 00282 #define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b ) 00283 00286 #define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b ) 00287 00289 00290 00294 00295 00296 #define AKSIMD_CMP_CTRLMASK __m128 00297 00299 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) ) 00300 00302 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) ) 00303 00305 #define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) ) 00306 00308 static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask ) 00309 { 00310 vB = _mm_and_ps( vB, vMask ); 00311 vA= _mm_andnot_ps( vMask, vA ); 00312 return _mm_or_ps( vA, vB ); 00313 } 00314 00315 // (cond1 >= cond2) ? b : a. 00316 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) ) 00317 00318 // a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax. 
00319 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) ) 00320 00321 #define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx)) 00322 00324 00325 00326 #include <emmintrin.h> 00327 00328 typedef __m128i AKSIMD_V4I32; 00329 00330 typedef AKSIMD_V4I32 AKSIMD_V4ICOND; 00331 00333 #define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) ) 00334 00336 #define AKSIMD_LOAD_V4I32( __addr__ ) _mm_load_si128( (__addr__) ) 00337 00339 #define AKSIMD_SETZERO_V4I32() _mm_setzero_si128() 00340 00341 #define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) ) 00342 00343 #define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) ) 00344 00346 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_store_si128( (__addr__), (__vec__) ) 00347 00350 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__addr__), (__vec__) ) 00351 00354 00355 00358 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) ) 00359 00362 #define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) ) 00363 00366 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) ) 00367 00370 #define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) ) 00371 00374 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) ) 00375 00377 00378 00381 #define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b ) 00382 00385 #define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b ) 00386 00389 #define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b ) 00390 00393 00394 00397 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \ 00398 _mm_slli_epi32( (__vec__), (__shiftBy__) ) 00399 00402 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \ 00403 _mm_srai_epi32( (__vec__), (__shiftBy__) ) 00404 00406 00407 00408 #if defined( 
AK_CPU_X86 ) && !defined(AK_IOS) /// MMX 00409 00410 typedef __m64 AKSIMD_V2F32; 00411 00412 #endif 00413 00414 00415 #endif //_AK_SIMD_SSE_H_
Questions? Problems? Need more info? Contact us, and we can help!
Visit our Support page, or register your project and we'll help you get started with no strings attached!
Get started with Wwise