Contents

include/AK/SoundEngine/Platforms/SSE/AkSimd.h

Go to the documentation of this file.
00001 
00002 //
00003 // Copyright (c) 2006 Audiokinetic Inc. / All Rights Reserved
00004 //
00006 
00007 // AkSimd.h
00008 
00011 
00012 #ifndef _AK_SIMD_SSE_H_
00013 #define _AK_SIMD_SSE_H_
00014 
00015 #include <AK/SoundEngine/Common/AkTypes.h>
00016 #include <xmmintrin.h>
00017 
00020 
00021 
#define AKSIMD_ARCHCACHELINESIZE    (64)                ///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE  (512)               ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)

/// Prefetch the cache line at __offset__ bytes past __add__ with the
/// non-temporal hint (data used once, not kept in cache).
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA ) 

/// Round __Size__ up to the next multiple of 16 bytes (SSE alignment requirement).
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
00034 
00035 
00036 
00039 
00040 
typedef float   AKSIMD_F32;     ///< 32-bit float scalar
typedef __m128  AKSIMD_V4F32;   ///< Vector of four 32-bit floats (one SSE register)
typedef AKSIMD_V4F32 AKSIMD_V4COND;  ///< Per-lane condition mask, stored in a float vector
typedef AKSIMD_V4F32 AKSIMD_V4FCOND; ///< Per-lane float condition mask
#define AKSIMD_V4F32_SUPPORTED
00046 
00048 
00049 
00050 
00053 
00054 
/// Loads four floats from a 16-byte-aligned address
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_load_ps( (AkReal32*)(__addr__) )

/// Loads four floats from an address with no alignment requirement
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )

/// Broadcasts a single float variable (taken by address) to all four lanes
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )

/// Broadcasts a float value to all four lanes
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )

/// Returns a vector with all four lanes set to zero
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()

/// Loads one float into lane 0 and zeroes lanes 1-3
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )
00078 
00080 
00081 
00082 
00085 
00086 
/// Stores four floats to a 16-byte-aligned address
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_store_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores four floats to an address with no alignment requirement
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores lane 0 only to the given address
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
00098 
00100 
00101 
00104 
00105 
// Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )

/// Selects four floats from a and b according to the immediate mask i.
/// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
/// Arguments are parenthesized so expression arguments expand safely.
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( (a), (b), (i) )

/// Result: lower two lanes = upper two lanes of b, upper two lanes = upper two lanes of a
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( (a), (b) )

/// Result: lower two lanes = lower two lanes of a, upper two lanes = lower two lanes of b
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( (a), (b) )

/// Swaps the lanes within each pair: (a,b,c,d) -> (b,a,d,c)
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))

/// Swaps the lower and upper halves: (a,b,c,d) -> (c,d,a,b)
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))

/// Rotates lanes down by one: (a,b,c,d) -> (b,c,d,a)
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))

/// Duplicates the odd lanes into the even lanes: (a,b,c,d) -> (b,b,d,d)
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even lanes into the odd lanes: (a,b,c,d) -> (a,a,c,c)
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
00142 
00144 
00145 
00146 
00149 
00150 
/// Lane-wise subtraction: a - b.  All argument macros below parenthesize
/// their parameters so expression arguments expand safely.
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( (a), (b) )

/// Lane-0-only subtraction; lanes 1-3 of the result are copied from a
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( (a), (b) )

/// Lane-wise addition: a + b
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( (a), (b) )

/// Lane-0-only addition; lanes 1-3 of the result are copied from a
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( (a), (b) )

/// Lane-wise multiplication: a * b
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( (a), (b) )

/// Lane-wise division: a / b
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( (a), (b) )

/// Lane-0-only multiplication; lanes 1-3 of the result are copied from a
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( (a), (b) )

/// Vector multiply-add: (a * b) + c  (SSE has no fused op; two rounding steps)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
/// Vector multiply-subtract: (a * b) - c
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )

/// Lane-0-only multiply-add: (a0 * b0) + c0
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

/// Lane-wise minimum of a and b
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( (a), (b) )

/// Lane-wise maximum of a and b
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( (a), (b) )

/// Absolute value: clears each lane's sign bit (andnot with -0.0f mask)
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), (a))

/// Negation: flips each lane's sign bit (xor with -0.0f mask)
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), (__a__))

/// Lane-wise square root
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
00204 
00209 static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 & vVec)
00210 {   
00211     __m128 vHighLow = _mm_movehl_ps(vVec, vVec);
00212     vVec = _mm_add_ps(vVec, vHighLow);
00213     vHighLow = _mm_shuffle_ps(vVec, vVec, 0x55);
00214     vVec = _mm_add_ps(vVec, vHighLow);
00215 } 
00216 
00217 static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
00218 {
00219     AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
00220     AKSIMD_HORIZONTALADD( vfDotProduct );
00221     return AKSIMD_SHUFFLE_V4F32( vfDotProduct, vfDotProduct, AKSIMD_SHUFFLE(0,0,0,0) );
00222 }
00223 
00225 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00226 {
00227     static const AKSIMD_V4F32 vSign = { -1.f, 1.f, -1.f, 1.f }; 
00228 
00229     AKSIMD_V4F32 vTmp1 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(2,2,0,0)); 
00230     vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
00231     AKSIMD_V4F32 vTmp2 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(3,3,1,1)); 
00232     vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
00233     vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
00234     return vTmp2;
00235 }
00236 
00237 #ifdef AK_SSE3
00238 
00239 #include <pmmintrin.h>
00240 
00242 static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00243 {
00244     AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1);    // multiplier real  (a1,   a1,   a0,   a0) 
00245     vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2);         // temp1            (a1d1, a1c1, a0d0, a0c0) 
00246     AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1); // shuf multiplicand(c1,   d1,   c0,   d0)  
00247     AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1);     // multiplier imag  (b1,   b1,   b0,   b0) 
00248     xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1);           // temp2            (b1c1, b1d1, b0c0, b0d0) 
00249     AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2);        // b1c1+a1d1, a1c1-b1d1, a0d0+b0d0, a0c0-b0c0 
00250     return vCOut;
00251 }
00252 
00253 #endif
00254 
// Asserts that the SSE flush-to-zero mode is enabled.
// NOTE(review): the _MSC_VER <= 1600 branch passes a `dummy` argument —
// presumably older MSVC headers declared _MM_GET_FLUSH_ZERO_MODE with a
// parameter; confirm against the targeted toolchains.
#if defined _MSC_VER && ( _MSC_VER <= 1600 )
    #define AKSIMD_ASSERTFLUSHZEROMODE  AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#else
    #define AKSIMD_ASSERTFLUSHZEROMODE  AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#endif
00260 
00262 
00263 
00264 
00267 
00268 
/// Lane-wise addition of four 32-bit integers (arguments parenthesized for safe expansion)
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( (a), (b) )

/// Interleaves the lower two floats of a and b: (a0, b0, a1, b1)
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( (a), (b) )

/// Interleaves the upper two floats of a and b: (a2, b2, a3, b3)
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( (a), (b) )
00287 
00289 
00290 
00294 
00295 
// Type carrying the per-lane masks produced by the comparison macros below
// (each lane is all-ones when the comparison holds, all-zeros otherwise).
#define AKSIMD_CMP_CTRLMASK __m128

/// Lane-wise a <= b
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )

/// Lane-wise a >= b
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )

/// Lane-wise a == b
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )
00306 
00308 static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
00309 {
00310     vB = _mm_and_ps( vB, vMask );
00311     vA= _mm_andnot_ps( vMask, vA );
00312     return _mm_or_ps( vA, vB );
00313 }
00314 
// (cond1 >= cond2) ? b : a.  (Arguments parenthesized for safe expansion.)
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( (__a__), (__b__), AKSIMD_GTEQ_V4F32( (__cond1__), (__cond2__) ) )

// a >= 0 ? b : c, written in the usual C++ operand order.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( (__a__), _mm_set1_ps(0) ) )

/// Broadcasts lane idx of var to all four lanes
#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32((var), (var), AKSIMD_SHUFFLE(idx,idx,idx,idx))
00322 
00324 
00325 
00326 #include <emmintrin.h>
00327 
typedef __m128i AKSIMD_V4I32;   ///< Vector of four 32-bit integers (one SSE2 register)

typedef AKSIMD_V4I32 AKSIMD_V4ICOND; ///< Per-lane integer condition mask

/// Loads 16 bytes of integer data from an address with no alignment requirement
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Loads 16 bytes of integer data from a 16-byte-aligned address
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_load_si128( (__addr__) )

/// Returns an integer vector with all lanes set to zero
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()

/// Broadcasts a 32-bit integer to all four lanes
#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )

/// Builds a vector from four 32-bit integers; _a lands in lane 0, _d in lane 3
#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )

/// Stores 16 bytes of integer data to a 16-byte-aligned address
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_store_si128( (__addr__), (__vec__) )

/// Stores 16 bytes of integer data to an address with no alignment requirement
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__addr__), (__vec__) )
00351 
00354 
00355 
/// Converts four 32-bit signed integers to floats
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )

/// Converts four floats to 32-bit signed integers using the current rounding mode
#define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )

/// Converts four floats to 32-bit signed integers, truncating toward zero
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )

/// Bitwise AND of two 128-bit integer vectors
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )

/// Lane-wise signed compare of eight 16-bit integers: a > b (all-ones mask when true)
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )

/// Interleaves the lower four 16-bit lanes of a and b
/// (arguments parenthesized for safe expansion)
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( (a), (b) )

/// Interleaves the upper four 16-bit lanes of a and b
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( (a), (b) )

/// Packs eight 32-bit integers (a then b) into 16-bit integers with signed saturation
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( (a), (b) )

/// Shifts each 32-bit lane left by __shiftBy__ bits, shifting in zeros
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
    _mm_slli_epi32( (__vec__), (__shiftBy__) )

/// Shifts each 32-bit lane right by __shiftBy__ bits, replicating the sign bit
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
    _mm_srai_epi32( (__vec__), (__shiftBy__) )
00404 
00406 
00407 
#if defined( AK_CPU_X86 ) && !defined(AK_IOS)   /// MMX

// NOTE(review): MMX 64-bit register type, only exposed on 32-bit x86 builds
// (excluding iOS). Presumably holds a pair of 32-bit values per the name —
// confirm lane semantics against the platform users of AKSIMD_V2F32.
typedef __m64   AKSIMD_V2F32;

#endif
00413 
00414 
00415 #endif //_AK_SIMD_SSE_H_