Table of Contents

include/AK/SoundEngine/Platforms/SSE/AkSimd.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
00003 released in source code form as part of the SDK installer package.
00004 
00005 Commercial License Usage
00006 
00007 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
00008 may use this file in accordance with the end user license agreement provided 
00009 with the software or, alternatively, in accordance with the terms contained in a
00010 written agreement between you and Audiokinetic Inc.
00011 
00012 Apache License Usage
00013 
00014 Alternatively, this file may be used under the Apache License, Version 2.0 (the 
00015 "Apache License"); you may not use this file except in compliance with the 
00016 Apache License. You may obtain a copy of the Apache License at 
00017 http://www.apache.org/licenses/LICENSE-2.0.
00018 
00019 Unless required by applicable law or agreed to in writing, software distributed
00020 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
00021 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
00022 the specific language governing permissions and limitations under the License.
00023 
00024   Version: <VERSION>  Build: <BUILDNUMBER>
00025   Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
00026 *******************************************************************************/
00027 
00028 // AkSimd.h
00029 
00030 /// \file 
00031 /// AKSIMD - SSE implementation
00032 
00033 #ifndef _AK_SIMD_SSE_H_
00034 #define _AK_SIMD_SSE_H_
00035 
00036 #include <AK/SoundEngine/Common/AkTypes.h>
00037 #include <xmmintrin.h>
00038 
00039 ////////////////////////////////////////////////////////////////////////
00040 /// @name Platform specific defines for prefetching
00041 //@{
00042 
00043 #define AKSIMD_ARCHCACHELINESIZE    (64)                ///< Assumed cache line width for architectures on this platform
00044 #define AKSIMD_ARCHMAXPREFETCHSIZE  (512)               ///< Use this to control the maximum amount of prefetching that is desirable (assuming 8-way cache)
00045 /// Cross-platform memory prefetch, with the non-temporal hint, of the address located __offset__ bytes past __add__ (see _mm_prefetch, _MM_HINT_NTA)
00046 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA ) 
00047 
00048 //@}
00049 ////////////////////////////////////////////////////////////////////////
00050 
00051 ////////////////////////////////////////////////////////////////////////
00052 /// @name Platform specific memory size alignment for allocation purposes
00053 //@{
00054 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15) ///< Rounds __Size__ up to the next multiple of 16
00055 //@}
00056 ////////////////////////////////////////////////////////////////////////
00057 
00058 ////////////////////////////////////////////////////////////////////////
00059 /// @name AKSIMD types
00060 //@{
00061 
00062 typedef float   AKSIMD_F32;     ///< 32-bit float
00063 typedef __m128  AKSIMD_V4F32;   ///< Vector of 4 32-bit floats
00064 typedef AKSIMD_V4F32 AKSIMD_V4COND;  ///< Vector of 4 comparison results (same type as AKSIMD_V4F32)
00065 typedef AKSIMD_V4F32 AKSIMD_V4FCOND;     ///< Vector of 4 comparison results (same type as AKSIMD_V4F32)
00066 
00067 //@}
00068 ////////////////////////////////////////////////////////////////////////
00069 
00070 
00071 ////////////////////////////////////////////////////////////////////////
00072 /// @name AKSIMD loading / setting
00073 //@{
00074 
00075 /// Loads four single-precision, floating-point values; the address must be 16-byte aligned (see _mm_load_ps)
00076 #define AKSIMD_LOAD_V4F32( __addr__ ) _mm_load_ps( (AkReal32*)(__addr__) )
00077 
00078 /// Loads four single-precision floating-point values from unaligned
00079 /// memory (see _mm_loadu_ps)
00080 #define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )
00081 
00082 /// Loads a single single-precision, floating-point value, copying it into all four words.
00083 /// __scalar__ must be an lvalue because its address is taken (see _mm_load1_ps, _mm_load_ps1)
00084 #define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )
00085 
00086 /// Sets the four single-precision, floating-point values to __scalar__ (see
00087 /// _mm_set1_ps, _mm_set_ps1)
00088 #define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )
00089 
00090 /// Sets the four single-precision, floating-point values to zero (see
00091 /// _mm_setzero_ps)
00092 #define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()
00093 
00094 /// Loads a single-precision, floating-point value into the low word
00095 /// and clears the upper three words.
00096 /// r0 := *__addr__; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
00097 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )
00098 
00099 //@}
00100 ////////////////////////////////////////////////////////////////////////
00101 
00102 
00103 ////////////////////////////////////////////////////////////////////////
00104 /// @name AKSIMD storing
00105 //@{
00106 
00107 /// Stores the four single-precision, floating-point values of __vec__ to __addr__.
00108 /// The address must be 16-byte aligned (see _mm_store_ps)
00109 #define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_store_ps( (AkReal32*)(__addr__), (__vec__) )
00110 
00111 /// Stores the four single-precision, floating-point values of __vec__ to __addr__.
00112 /// The address does not need to be 16-byte aligned (see _mm_storeu_ps).
00113 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
00114 
00115 /// Stores the lower single-precision, floating-point value of __vec__.
00116 /// *__addr__ := r0 of __vec__ (see _mm_store_ss)
00117 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
00118 
00119 //@}
00120 ////////////////////////////////////////////////////////////////////////
00121 
00122 ////////////////////////////////////////////////////////////////////////
00123 /// @name AKSIMD shuffling
00124 //@{
00125 
00126 // Builds the shuffle parameter (mask i) for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
00127 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )
00128 
00129 /// Selects four specific single-precision, floating-point values from a and b, based on the mask i:
00130 /// the two lower result values are picked from a and the two upper ones from b (see _mm_shuffle_ps)
00131 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
00132 #define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )
00133 
00134 /// Moves the upper two single-precision, floating-point values of b to
00135 /// the lower two single-precision, floating-point values of the result.
00136 /// The upper two single-precision, floating-point values of a are passed
00137 /// through to the result.
00138 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
00139 #define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )
00140 
00141 /// Moves the lower two single-precision, floating-point values of b to
00142 /// the upper two single-precision, floating-point values of the result.
00143 /// The lower two single-precision, floating-point values of a are passed
00144 /// through to the result.
00145 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
00146 #define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )
00147 
00148 /// Swaps the 2 lower floats with each other and the 2 higher floats with each other (d c b a -> c d a b)
00149 #define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))
00150 
00151 /// Swaps the pair of lower floats with the pair of higher floats (d c b a -> b a d c)
00152 #define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))
00153 
00154 /// Barrel-shifts all floats by one position (d c b a -> a d c b)
00155 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))
00156 
00157 /// Duplicates the odd items into the even items (d c b a -> d d b b )
00158 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
00159 
00160 /// Duplicates the even items into the odd items (d c b a -> c c a a )
00161 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
00162 
00163 //@}
00164 ////////////////////////////////////////////////////////////////////////
00165 
00166 
00167 ////////////////////////////////////////////////////////////////////////
00168 /// @name AKSIMD arithmetic
00169 //@{
00170 
00171 /// Subtracts the four single-precision, floating-point values of
00172 /// a and b (a - b) (see _mm_sub_ps)
00173 #define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )
00174 
00175 /// Subtracts the lower single-precision, floating-point values of a and b.
00176 /// The upper three single-precision, floating-point values are passed through from a.
00177 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
00178 #define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )
00179 
00180 /// Adds the four single-precision, floating-point values of
00181 /// a and b (see _mm_add_ps)
00182 #define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )
00183 
00184 /// Adds the lower single-precision, floating-point values of a and b; the
00185 /// upper three single-precision, floating-point values are passed through from a.
00186 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
00187 #define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )
00188 
00189 /// Multiplies the four single-precision, floating-point values
00190 /// of a and b (see _mm_mul_ps)
00191 #define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )
00192 
00193 #define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b ) ///< Divides the four single-precision, floating-point values of a by those of b (see _mm_div_ps)
00194 
00195 /// Multiplies the lower single-precision, floating-point values of
00196 /// a and b; the upper three single-precision, floating-point values
00197 /// are passed through from a.
00198 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
00199 #define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )
00200 
00201 /// Vector multiply-add operation: a * b + c, as a separate multiply followed by an add.
00202 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
00203 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) ) ///< Vector multiply-subtract operation: a * b - c
00204 
00205 /// Scalar multiply-add operation on the lower values: r0 := a0 * b0 + c0; r1 := a1; r2 := a2; r3 := a3
00206 #define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )
00207 
00208 /// Computes the minima of the four single-precision, floating-point
00209 /// values of a and b (see _mm_min_ps)
00210 #define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )
00211 
00212 /// Computes the maximums of the four single-precision, floating-point
00213 /// values of a and b (see _mm_max_ps)
00214 #define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )
00215 
00216 /// Computes the absolute value by clearing the sign bit of each value (the -0.f constant has only the sign bit set)
00217 #define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
00218 
00219 /// Changes the sign by flipping the sign bit of each value
00220 #define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)
00221 
00222 /// Vector square root (see _mm_sqrt_ps)
00223 #define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
00224 
00225 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a) (see _mm_rsqrt_ps)
00226 #define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )
00227 
00228 /// Faked in-place horizontal add: on return, element 0 of vVec holds v0 + v1 + v2 + v3.
00229 /// \akwarning
00230 /// Elements 1 to 3 do not hold the total, and this sequence is not expected to be very efficient.
00231 /// \endakwarning
00232 static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 & vVec)
00233 {
00234     const __m128 vUpperPair = _mm_movehl_ps(vVec, vVec);                                    // (v3, v2, v3, v2)
00235     const __m128 vPairSums = _mm_add_ps(vVec, vUpperPair);                                  // r0 = v0+v2 ; r1 = v1+v3
00236     const __m128 vOddSum = _mm_shuffle_ps(vPairSums, vPairSums, _MM_SHUFFLE(1,1,1,1));      // r1 copied to every element
00237     vVec = _mm_add_ps(vPairSums, vOddSum);                                                  // r0 = (v0+v2) + (v1+v3)
00238 } 
00239 /// Dot product of vVec and vfSigns (both only read); the scalar result is returned in all four elements.
00240 static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( const AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
00241 {
00242     AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );                          // element-wise products
00243     AKSIMD_HORIZONTALADD( vfDotProduct );                                                   // element 0 := sum of the four products
00244     return AKSIMD_SHUFFLE_V4F32( vfDotProduct, vfDotProduct, AKSIMD_SHUFFLE(0,0,0,0) );     // broadcast element 0
00245 }
00246 
00247 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts: (a + ib) * (c + id)
00248 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00249 {
00250     // Negates the b*d products, which land in the real (even) elements
00251     const AKSIMD_V4F32 vNegateEven = _mm_setr_ps( -1.f, 1.f, -1.f, 1.f );
00252     const AKSIMD_V4F32 vReal1 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(2,2,0,0) );       // (a1,   a1,   a0,   a0)
00253     const AKSIMD_V4F32 vImag1 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(3,3,1,1) );       // (b1,   b1,   b0,   b0)
00254     const AKSIMD_V4F32 vSwapped2 = _mm_shuffle_ps( vCIn2, vCIn2, _MM_SHUFFLE(2,3,0,1) );    // (c1,   d1,   c0,   d0)
00255     const AKSIMD_V4F32 vRealTerms = _mm_mul_ps( vReal1, vCIn2 );                            // (a1d1, a1c1, a0d0, a0c0)
00256     const AKSIMD_V4F32 vImagTerms = _mm_mul_ps( _mm_mul_ps( vImag1, vNegateEven ), vSwapped2 ); // (b1c1, -b1d1, b0c0, -b0d0)
00257     return _mm_add_ps( vImagTerms, vRealTerms );
00258 }
00259 
00260 #ifdef AK_SSE3
00261 
00262 #include <pmmintrin.h>
00263 
00264 /// SSE3 SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts: (a + ib) * (c + id)
00265 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00266 {
00267     AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1);    // multiplier real  (a1,   a1,   a0,   a0) 
00268     vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2);         // temp1            (a1d1, a1c1, a0d0, a0c0) 
00269     AKSIMD_V4F32 vXMM1 = _mm_shuffle_ps(vCIn2, vCIn2, _MM_SHUFFLE(2,3,0,1)); // shuf multiplicand(c1,   d1,   c0,   d0)
00270     AKSIMD_V4F32 vXMM2 = _mm_movehdup_ps(vCIn1);    // multiplier imag  (b1,   b1,   b0,   b0) 
00271     vXMM2 = AKSIMD_MUL_V4F32( vXMM2, vXMM1);        // temp2            (b1c1, b1d1, b0c0, b0d0) 
00272     // _mm_addsub_ps subtracts in the even (real) elements and adds in the odd (imaginary) elements
00273     return _mm_addsub_ps(vXMM0, vXMM2);             // a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0
00274 }
00275 
00276 #endif
00277 /// Checks through AKASSERT that _MM_GET_FLUSH_ZERO_MODE reports _MM_FLUSH_ZERO_ON.
00278 #if defined _MSC_VER && ( _MSC_VER <= 1600 ) // NOTE(review): these MSVC versions presumably require an argument for _MM_GET_FLUSH_ZERO_MODE - confirm
00279     #define AKSIMD_ASSERTFLUSHZEROMODE  AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
00280 #else
00281     #define AKSIMD_ASSERTFLUSHZEROMODE  AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
00282 #endif
00283 
00284 //@}
00285 ////////////////////////////////////////////////////////////////////////
00286 
00287 
00288 ////////////////////////////////////////////////////////////////////////
00289 /// @name AKSIMD integer arithmetic
00290 //@{
00291 
00292 /// Adds the four integer values of a and b (see _mm_add_epi32)
00293 #define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )
00294 
00295 #define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b) ///< Compares the four signed 32-bit integers for a < b (see _mm_cmplt_epi32)
00296 #define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b) ///< Compares the four signed 32-bit integers for a > b (see _mm_cmpgt_epi32)
00297 #define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b) ///< Bitwise XOR of the 128-bit values a and b (see _mm_xor_si128)
00298 #define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps(a,b) ///< Bitwise XOR of the four single-precision values of a and b (see _mm_xor_ps)
00299 #define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b) ///< Subtracts the four integer values of a and b (a - b) (see _mm_sub_epi32)
00300 
00301 /// Multiplies the packed 16-bit integers of a and b and keeps the low 16 bits of each product, so products are assumed not to overflow 16 bits (see _mm_mullo_epi16)
00302 #define AKSIMD_MULLO16_V4I32( a , b) _mm_mullo_epi16(a, b)
00303 //@}
00304 ////////////////////////////////////////////////////////////////////////
00305 
00306 
00307 ////////////////////////////////////////////////////////////////////////
00308 /// @name AKSIMD packing / unpacking
00309 //@{
00310 
00311 /// Selects and interleaves the lower two single-precision, floating-point
00312 /// values from a and b: r0 := a0 ; r1 := b0 ; r2 := a1 ; r3 := b1 (see _mm_unpacklo_ps)
00313 #define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )
00314 
00315 /// Selects and interleaves the upper two single-precision, floating-point
00316 /// values from a and b: r0 := a2 ; r1 := b2 ; r2 := a3 ; r3 := b3 (see _mm_unpackhi_ps)
00317 #define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )
00318 
00319 //@}
00320 ////////////////////////////////////////////////////////////////////////
00321 
00322 ////////////////////////////////////////////////////////////////////////
00323 /// @name AKSIMD vector comparison
00324 /// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms. 
00325 //@{
00326 
00327 #define AKSIMD_CMP_CTRLMASK __m128 ///< Type used to hold a comparison control mask
00328 
00329 /// Vector "<=" operation (see _mm_cmple_ps)
00330 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )
00331 
00332 #define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) ) ///< Vector "<" operation (see _mm_cmplt_ps)
00333 
00334 /// Vector ">=" operation (see _mm_cmpge_ps)
00335 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )
00336 
00337 #define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) ) ///< Vector ">" operation (see _mm_cmpgt_ps)
00338 
00339 /// Vector "==" operation (see _mm_cmpeq_ps)
00340 #define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )
00341 
00342 /// Bitwise select: takes the bits of vB where vMask is set and the bits of vA where vMask is clear. The control mask is usually provided by the comparison operations above
00343 static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
00344 {
00345     const AKSIMD_V4F32 vFromB = _mm_and_ps( vB, vMask );        // vB kept where the mask bits are 1
00346     const AKSIMD_V4F32 vFromA = _mm_andnot_ps( vMask, vA );     // vA kept where the mask bits are 0 (andnot complements its first operand)
00347     return _mm_or_ps( vFromA, vFromB );
00348 }
00349 
00350 // (cond1 >= cond2) ? b : a, selected independently for each of the four values.
00351 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )
00352 
00353 // a >= 0 ? b : c, selected independently for each of the four values. Arguments follow the order of the C++ conditional operator.
00354 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )
00355 
00356 #define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx)) ///< Copies value number idx of var into all four values
00357 
00358 #define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ ) ///< Builds a 4-bit integer mask from the sign bits of the four values (see _mm_movemask_ps)
00359 
00360 //@}
00361 ////////////////////////////////////////////////////////////////////////
00362 
00363 #include <emmintrin.h>
00364 
00365 typedef __m128i AKSIMD_V4I32;   ///< Vector of 4 32-bit signed integers
00366 
00367 typedef AKSIMD_V4I32 AKSIMD_V4ICOND; ///< Same type as AKSIMD_V4I32
00368 
00369 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
00370 #define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
00371 
00372 /// Loads aligned 128-bit value; the address must be 16-byte aligned (see _mm_load_si128)
00373 #define AKSIMD_LOAD_V4I32( __addr__ ) _mm_load_si128( (__addr__) )
00374 
00375 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
00376 #define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()
00377 
00378 #define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) ) ///< Sets the four 32-bit integer values to __scalar__ (see _mm_set1_epi32)
00379 
00380 #define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) ) ///< r3 := _d ; r2 := _c ; r1 := _b ; r0 := _a (see _mm_set_epi32)
00381 
00382 /// Stores four 32-bit integer values. The address must be 16-byte aligned (see _mm_store_si128).
00383 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_store_si128( (__addr__), (__vec__) )
00384 
00385 /// Stores four 32-bit integer values. The address
00386 /// does not need to be 16-byte aligned (see _mm_storeu_si128).
00387 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__addr__), (__vec__) )
00388 
00389 ////////////////////////////////////////////////////////////////////////
00390 /// @name AKSIMD conversion
00391 //@{
00392 
00393 /// Converts the four signed 32-bit integer values of __vec__ to single-precision,
00394 /// floating-point values (see _mm_cvtepi32_ps)
00395 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )
00396 
00397 /// Converts the four single-precision, floating-point values of __vec__ to signed
00398 /// 32-bit integer values by rounding (see _mm_cvtps_epi32)
00399 #define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )
00400 
00401 /// Converts the four single-precision, floating-point values of __vec__ to signed
00402 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
00403 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )
00404 
00405 /// Computes the bitwise AND of the 128-bit value in __a__ and the
00406 /// 128-bit value in __b__ (see _mm_and_si128)
00407 #define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )
00408 
00409 /// Compares the 8 signed 16-bit integers in __a__ and the 8 signed
00410 /// 16-bit integers in __b__ for greater than (see _mm_cmpgt_epi16)
00411 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )
00412 
00413 //@}
00414 ////////////////////////////////////////////////////////////////////////
00415 
00416 /// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
00417 /// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
00418 #define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )
00419 
00420 /// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
00421 /// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
00422 #define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )
00423 
00424 /// Packs the 8 signed 32-bit integers from a and b (4 from each) into 8 signed
00425 /// 16-bit integers, with saturation (see _mm_packs_epi32)
00426 #define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )
00427 
00428 ////////////////////////////////////////////////////////////////////////
00429 /// @name AKSIMD shifting
00430 //@{
00431 
00432 /// Shifts the 4 signed or unsigned 32-bit integers in __vec__ left by
00433 /// __shiftBy__ bits while shifting in zeros (see _mm_slli_epi32)
00434 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
00435     _mm_slli_epi32( (__vec__), (__shiftBy__) )
00436 
00437 /// Shifts the 4 signed 32-bit integers in __vec__ right by __shiftBy__
00438 /// bits while shifting in the sign bit (see _mm_srai_epi32)
00439 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
00440     _mm_srai_epi32( (__vec__), (__shiftBy__) )
00441 
00442 //@}
00443 ////////////////////////////////////////////////////////////////////////
00444 
00445 #if defined( AK_CPU_X86 ) && !defined(AK_IOS) /// MMX type, declared only when AK_CPU_X86 is defined and AK_IOS is not
00446 
00447 typedef __m64   AKSIMD_V2F32;   ///< Vector of 2 32-bit floats (stored in the 64-bit __m64 type)
00448 
00449 #endif
00450 
00451 
00452 #endif //_AK_SIMD_SSE_H_