00001
00002
00003
00004
00006
00007
00008
00011
00012 #ifndef _AK_SIMD_SSE_H_
00013 #define _AK_SIMD_SSE_H_
00014
00015 #include <AK/SoundEngine/Common/AkTypes.h>
00016 #include <xmmintrin.h>
00017
00020
00021
#define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)

/// Prefetches one cache line at (__add__ + __offset__) with a non-temporal hint (data not expected to be reused soon)
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )

/// Rounds __Size__ up to the next multiple of 16 bytes (SSE vector alignment)
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)

typedef float AKSIMD_F32;            ///< 32-bit float scalar
typedef __m128 AKSIMD_V4F32;         ///< Vector of 4 32-bit floats
typedef AKSIMD_V4F32 AKSIMD_V4COND;  ///< Lane-mask result of a vector comparison
typedef AKSIMD_V4F32 AKSIMD_V4FCOND; ///< Lane-mask result of a float vector comparison
#define AKSIMD_V4F32_SUPPORTED
00046
00048
00049
00050
00053
00054
/// Loads four single-precision floats from 16-byte-aligned memory
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_load_ps( (AkReal32*)(__addr__) )

/// Loads four single-precision floats from unaligned memory
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )

/// Loads a single float from memory and broadcasts it to all four lanes
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )

/// Sets all four lanes to the given scalar value
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )

/// Returns a vector with all four lanes set to zero
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()

/// Loads one float into lane 0 and zeroes the upper three lanes
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )

/// Stores four floats to 16-byte-aligned memory
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_store_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores four floats to unaligned memory
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores lane 0 only to memory
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
00098
00100
00101
00104
00105
00106
/// Builds a 2-bit-per-lane shuffle immediate; same encoding as _MM_SHUFFLE
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )

/// Shuffles lanes of a and b according to immediate i (see _mm_shuffle_ps)
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )

/// Result: lower two lanes = upper two lanes of b; upper two lanes = upper two lanes of a
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )

/// Result: lower two lanes = lower two lanes of a; upper two lanes = lower two lanes of b
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )

/// Swaps adjacent lane pairs: (a,b,c,d) -> (b,a,d,c)
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))

/// Swaps lane halves: (a,b,c,d) -> (c,d,a,b)
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))

/// Rotates lanes left by one: (a,b,c,d) -> (b,c,d,a)
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))

/// Duplicates the odd lanes: (a,b,c,d) -> (b,b,d,d)
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even lanes: (a,b,c,d) -> (a,a,c,c)
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
00142
00144
00145
00146
00149
00150
// NOTE(review): every macro argument below is now parenthesized in its
// expansion, so compound-expression arguments cannot change evaluation
// through operator precedence. Expansions are otherwise unchanged.

/// Subtracts each lane of b from the corresponding lane of a
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( (a), (b) )

/// Subtracts lane 0 of b from lane 0 of a; upper three lanes copied from a
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( (a), (b) )

/// Adds the corresponding lanes of a and b
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( (a), (b) )

/// Adds lane 0 of a and b; upper three lanes copied from a
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( (a), (b) )

/// Multiplies the corresponding lanes of a and b
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( (a), (b) )

/// Divides each lane of a by the corresponding lane of b
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( (a), (b) )

/// Multiplies lane 0 of a and b; upper three lanes copied from a
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( (a), (b) )

/// Multiply-add a*b + c / multiply-sub a*b - c (separate mul then add/sub on SSE; not fused)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )

/// Scalar (lane 0) multiply-add: a*b + c
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

/// Lane-wise minimum of a and b
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( (a), (b) )

/// Lane-wise maximum of a and b
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( (a), (b) )

/// Lane-wise absolute value (clears the sign bit with andnot against -0.f)
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), (a))

/// Lane-wise negation (flips the sign bit with xor against -0.f)
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), (__a__))

/// Lane-wise square root
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
00204
00209 static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 & vVec)
00210 {
00211 __m128 vHighLow = _mm_movehl_ps(vVec, vVec);
00212 vVec = _mm_add_ps(vVec, vHighLow);
00213 vHighLow = _mm_shuffle_ps(vVec, vVec, 0x55);
00214 vVec = _mm_add_ps(vVec, vHighLow);
00215 }
00216
00217 static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
00218 {
00219 AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
00220 AKSIMD_HORIZONTALADD( vfDotProduct );
00221 return AKSIMD_SHUFFLE_V4F32( vfDotProduct, vfDotProduct, AKSIMD_SHUFFLE(0,0,0,0) );
00222 }
00223
00225 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00226 {
00227 static const AKSIMD_V4F32 vSign = { -1.f, 1.f, -1.f, 1.f };
00228
00229 AKSIMD_V4F32 vTmp1 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(2,2,0,0));
00230 vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
00231 AKSIMD_V4F32 vTmp2 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(3,3,1,1));
00232 vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
00233 vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
00234 return vTmp2;
00235 }
00236
00237 #ifdef AK_SSE3
00238
00239 #include <pmmintrin.h>
00240
00242 static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00243 {
00244 AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1);
00245 vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2);
00246 AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1);
00247 AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1);
00248 xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1);
00249 AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2);
00250 return vCOut;
00251 }
00252
00253 #endif
00254
/// Asserts that flush-to-zero mode is enabled for denormals.
/// NOTE(review): the _MSC_VER <= 1600 branch passes a `dummy` token —
/// presumably old MSVC declared _MM_GET_FLUSH_ZERO_MODE with a parameter;
/// confirm against the VS2010 toolchain before touching this.
#if defined _MSC_VER && ( _MSC_VER <= 1600 )
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#else
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#endif

/// Adds the corresponding 32-bit integer lanes of a and b
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )

/// Interleaves the lower two lanes of a and b: (a0, b0, a1, b1)
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )

/// Interleaves the upper two lanes of a and b: (a2, b2, a3, b3)
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )
00287
00289
00290
00294
00295
/// Type carrying the lane-mask produced by vector comparisons
#define AKSIMD_CMP_CTRLMASK __m128

/// Lane-wise a <= b; each lane is all-ones where true, all-zeros where false
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )

/// Lane-wise a >= b
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )

/// Lane-wise a == b
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )
00306
00308 static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
00309 {
00310 vB = _mm_and_ps( vB, vMask );
00311 vA= _mm_andnot_ps( vMask, vA );
00312 return _mm_or_ps( vA, vB );
00313 }
00314
00315
/// Lane-wise select: b where cond1 >= cond2, a elsewhere
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )

/// Lane-wise select: b where a >= 0, c elsewhere
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )

/// Broadcasts lane idx of var to all four lanes (idx must be a compile-time constant)
#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
00322
00324
00325
#include <emmintrin.h>

typedef __m128i AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers

typedef AKSIMD_V4I32 AKSIMD_V4ICOND; ///< Lane-mask result of an integer vector comparison

/// Loads four 32-bit integers from unaligned memory
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Loads four 32-bit integers from 16-byte-aligned memory
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_load_si128( (__addr__) )

/// Returns an integer vector with all lanes set to zero
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()

/// Sets all four integer lanes to the given scalar
#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )

/// Builds an integer vector from four scalars; _a lands in lane 0, _d in lane 3
#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )

/// Stores four 32-bit integers to 16-byte-aligned memory
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_store_si128( (__addr__), (__vec__) )

/// Stores four 32-bit integers to unaligned memory
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__addr__), (__vec__) )
00351
00354
00355
/// Converts four 32-bit signed integers to floats
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )

/// Converts four floats to 32-bit signed integers using the current rounding mode
#define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )

/// Converts four floats to 32-bit signed integers, truncating toward zero
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )

/// Bitwise AND of two integer vectors
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )

/// Lane-wise signed 16-bit a > b; each 16-bit lane all-ones where true
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )

/// Interleaves the lower four 16-bit lanes of a and b
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )

/// Interleaves the upper four 16-bit lanes of a and b
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )

/// Packs eight 32-bit signed integers (a then b) into eight 16-bit integers with signed saturation
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )

/// Shifts each 32-bit lane left by __shiftBy__ bits, shifting in zeros
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
_mm_slli_epi32( (__vec__), (__shiftBy__) )

/// Arithmetic (sign-extending) right shift of each 32-bit lane by __shiftBy__ bits
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
_mm_srai_epi32( (__vec__), (__shiftBy__) )
00404
00406
00407
#if defined( AK_CPU_X86 ) && !defined(AK_IOS) /// MMX

/// Vector of 2 32-bit floats.
/// NOTE(review): backed by the 64-bit MMX register type __m64 — code using it
/// must follow MMX/x87 state rules (_mm_empty); confirm usage sites.
typedef __m64 AKSIMD_V2F32;

#endif
00413
00414
00415 #endif //_AK_SIMD_SSE_H_