目录

include/AK/SoundEngine/Platforms/Generic/AkSimd.h

Go to the documentation of this file.
00001 
00002 //
00003 // Copyright (c) 2006 Audiokinetic Inc. / All Rights Reserved
00004 //
00006 
00007 // AkSimd.h
00008 
00011 
00012 #ifndef _AKSIMD_GENERIC_H_
00013 #define _AKSIMD_GENERIC_H_
00014 
00015 #include <math.h>
00016 #include <string.h>
00017 #include <AK/SoundEngine/Common/AkTypes.h>
00018 #include <AK/Tools/Common/AkPlatformFuncs.h>
00019 
00022 
00023 typedef AkInt32 AKSIMD_I32;                                 ///< Scalar integer element type (alias of AkInt32)
00024 typedef struct { AkInt32 m_data[4]; } AKSIMD_V4I32;         ///< Vector of 4 AkInt32 elements
00025 typedef struct { AkUInt32 m_data[4]; } AKSIMD_V4UI32;       ///< Vector of 4 AkUInt32 elements
00026 typedef AkReal32 AKSIMD_F32;                                ///< Scalar floating-point element type (alias of AkReal32)
00027 typedef struct { AkReal32 m_data[2]; } AKSIMD_V2F32;        ///< Vector of 2 AkReal32 elements
00028 typedef struct { AkReal32 m_data[4]; } AKSIMD_V4F32;        ///< Vector of 4 AkReal32 elements
00029 typedef AKSIMD_V4UI32   AKSIMD_V4COND;                      ///< Per-element condition mask (same layout as AKSIMD_V4UI32)
00030 
00031 
00032 typedef struct { AkInt32 m_data[4]; }  __attribute__((__packed__)) AKSIMD_V4I32_UNALIGNED;      ///< Packed variant of a 4 x AkInt32 vector
00033 typedef struct { AkUInt32 m_data[4]; } __attribute__((__packed__)) AKSIMD_V4UI32_UNALIGNED;     ///< Packed variant of a 4 x AkUInt32 vector
00034 typedef struct { AkReal32 m_data[2]; } __attribute__((__packed__)) AKSIMD_V2F32_UNALIGNED;      ///< Packed variant of a 2 x AkReal32 vector
00035 typedef struct { AkReal32 m_data[4]; } __attribute__((__packed__)) AKSIMD_V4F32_UNALIGNED;      ///< Packed variant of a 4 x AkReal32 vector
00036 
00038 
00039 
// Element accessors. Each expands to the m_data[] slot of the named vector,
// so the result can be read or assigned. Every definition is guarded by
// #ifndef, so an earlier definition of the same macro takes precedence.

/// Accesses element __num__ of the AKSIMD_V4F32 named __vName.
00040 #ifndef AKSIMD_GETELEMENT_V4F32
00041 #define AKSIMD_GETELEMENT_V4F32( __vName, __num__ )             (__vName).m_data[(__num__)]
00042 #endif
00043 
/// Accesses element __num__ of the AKSIMD_V2F32 named __vName.
00044 #ifndef AKSIMD_GETELEMENT_V2F32
00045 #define AKSIMD_GETELEMENT_V2F32( __vName, __num__ )             (__vName).m_data[(__num__)]
00046 #endif
00047 
/// Accesses element __num__ of the AKSIMD_V4I32 named __vName.
00048 #ifndef AKSIMD_GETELEMENT_V4I32
00049 #define AKSIMD_GETELEMENT_V4I32( __vName, __num__ )             (__vName).m_data[(__num__)]
00050 #endif
00051 
00054 
/// Rounds __Size__ up to the next multiple of 16 (values already a multiple of 16 are unchanged).
00055 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
00056 
00057 
00058 
00061 
/// Loads an integer vector by dereferencing in_pData as-is (no cast is applied,
/// so in_pData must already point to the vector type to be read).
00062 #define AKSIMD_LOADU_V4I32( in_pData ) (*(in_pData))
00063 
/// Loads four floats starting at in_pValue by reinterpreting the address as an AKSIMD_V4F32.
00064 #define AKSIMD_LOADU_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))
00065 
/// Same expansion as AKSIMD_LOADU_V4F32 in this generic implementation.
00066 #define AKSIMD_LOAD_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))
00067 
00068 AkForceInline AKSIMD_V4F32 AKSIMD_LOAD1_V4F32( AKSIMD_F32 in_value )
00069 {
00070     AKSIMD_V4F32 vector;
00071     vector.m_data[0] = in_value;
00072     vector.m_data[1] = in_value;
00073     vector.m_data[2] = in_value;
00074     vector.m_data[3] = in_value;
00075     
00076     return vector;
00077 }
00078 
00079 // _mm_set_ps1
00080 AkForceInline AKSIMD_V4F32 AKSIMD_SET_V4F32( AKSIMD_F32 in_value )
00081 {
00082     AKSIMD_V4F32 vector;
00083     vector.m_data[0] = in_value;
00084     vector.m_data[1] = in_value;
00085     vector.m_data[2] = in_value;
00086     vector.m_data[3] = in_value;
00087     
00088     return vector;
00089 }
00090 
00091 
00092 AkForceInline AKSIMD_V2F32 AKSIMD_SET_V2F32( AKSIMD_F32 in_value )
00093 {
00094     AKSIMD_V2F32 vector;
00095     vector.m_data[0] = in_value;
00096     vector.m_data[1] = in_value;
00097     
00098     return vector;
00099 }
00100 
00101 // _mm_setzero_ps()
00102 AkForceInline AKSIMD_V4F32 AKSIMD_SETZERO_V4F32()
00103 {
00104     AKSIMD_V4F32 vector;
00105     vector.m_data[0] = 0.f;
00106     vector.m_data[1] = 0.f;
00107     vector.m_data[2] = 0.f;
00108     vector.m_data[3] = 0.f;
00109     
00110     return vector;
00111 }
00112 
00113 AkForceInline AKSIMD_V2F32 AKSIMD_SETZERO_V2F32()
00114 {
00115     AKSIMD_V2F32 vector;
00116     vector.m_data[0] = 0.f;
00117     vector.m_data[1] = 0.f;
00118     
00119     return vector;
00120 }
00121 // _mm_setzero_si128()
00122 AkForceInline AKSIMD_V4I32 AKSIMD_SETZERO_V4I32()
00123 {
00124     AKSIMD_V4I32 vector;
00125     vector.m_data[0] = 0;
00126     vector.m_data[1] = 0;
00127     vector.m_data[2] = 0;
00128     vector.m_data[3] = 0;
00129     
00130     return vector;
00131 }
00132 
00133 
00137 AkForceInline AKSIMD_V4F32 AKSIMD_LOAD_SS_V4F32( const AKSIMD_F32* in_pData )
00138 {
00139     AKSIMD_V4F32 vector;
00140     vector.m_data[0] = *in_pData;
00141     vector.m_data[1] = 0.f;
00142     vector.m_data[2] = 0.f;
00143     vector.m_data[3] = 0.f;
00144     
00145     return vector;
00146 }
00147 
00149 
00150 
00153 
00154 
// _mm_storeu_ps -- The address does not need to be 16-byte aligned.
/// Copies the four floats of in_vec to the memory at in_pTo.
/// The expansion is wrapped in parentheses so that it behaves as a single
/// expression wherever it is used.
#define AKSIMD_STOREU_V4F32( in_pTo, in_vec ) ((*(AKSIMD_V4F32*)(in_pTo)) = (in_vec))

// _mm_store_ps -- The address must be 16-byte aligned.
/// In this generic implementation the aligned store simply forwards to the
/// unaligned one; both perform the same struct assignment.
#define AKSIMD_STORE_V4F32( __addr__, __vName__ ) AKSIMD_STOREU_V4F32(__addr__, __vName__)

// _mm_storeu_si128
/// Copies the four integers of in_vec to the memory at in_pTo.
#define AKSIMD_STOREU_V4I32( in_pTo, in_vec ) ((*(AKSIMD_V4I32*)(in_pTo)) = (in_vec))
00164 
00167 AkForceInline void AKSIMD_STORE1_V4F32( AKSIMD_F32* in_pTo, const AKSIMD_V4F32& in_vec )
00168 {
00169     ((AKSIMD_V4F32*)in_pTo)->m_data[0] = in_vec.m_data[0];
00170 }
00171 
00173 
00174 
00177 
00178 
00179 // _mm_cvtepi32_ps
00180 AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4I32_TO_V4F32( const AKSIMD_V4I32& in_from )
00181 {
00182     AKSIMD_V4F32 vector;
00183     vector.m_data[0] = (AkReal32)in_from.m_data[0];
00184     vector.m_data[1] = (AkReal32)in_from.m_data[1];
00185     vector.m_data[2] = (AkReal32)in_from.m_data[2];
00186     vector.m_data[3] = (AkReal32)in_from.m_data[3];
00187     
00188     return vector;
00189 }
00190 // _mm_cvtps_epi32
00191 AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4I32( const AKSIMD_V4F32& in_from )
00192 {
00193     AKSIMD_V4I32 vector;
00194     vector.m_data[0] = (AkInt32)in_from.m_data[0];
00195     vector.m_data[1] = (AkInt32)in_from.m_data[1];
00196     vector.m_data[2] = (AkInt32)in_from.m_data[2];
00197     vector.m_data[3] = (AkInt32)in_from.m_data[3];
00198     
00199     return vector;
00200 }
00201 
00203 
00204 
00207 
00208 
00209 // _mm_and_si128
00210 AkForceInline AKSIMD_V4I32 AKSIMD_AND_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00211 {
00212     AKSIMD_V4I32 vector;
00213     vector.m_data[0] = in_vec1.m_data[0] & in_vec2.m_data[0];
00214     vector.m_data[1] = in_vec1.m_data[1] & in_vec2.m_data[1];
00215     vector.m_data[2] = in_vec1.m_data[2] & in_vec2.m_data[2];
00216     vector.m_data[3] = in_vec1.m_data[3] & in_vec2.m_data[3];
00217     
00218     return vector;
00219 }
00220 
00223 AkForceInline AKSIMD_V4I32 AKSIMD_CMPGT_V8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00224 {
00225     AKSIMD_V4I32 vector;
00226     
00227     AkInt16 *pVec1,*pVec2,*pVec3;
00228     pVec1 = (AkInt16*)&in_vec1;
00229     pVec2 = (AkInt16*)&in_vec2;
00230     pVec3 = (AkInt16*)&vector;
00231     
00232     pVec3[0] = (pVec1[0] > pVec2[0]) ? 0xffff : 0x0;
00233     pVec3[1] = (pVec1[1] > pVec2[1]) ? 0xffff : 0x0;
00234     pVec3[2] = (pVec1[2] > pVec2[2]) ? 0xffff : 0x0;
00235     pVec3[3] = (pVec1[3] > pVec2[3]) ? 0xffff : 0x0;
00236     pVec3[4] = (pVec1[4] > pVec2[4]) ? 0xffff : 0x0;
00237     pVec3[5] = (pVec1[5] > pVec2[5]) ? 0xffff : 0x0;
00238     pVec3[6] = (pVec1[6] > pVec2[6]) ? 0xffff : 0x0;
00239     pVec3[7] = (pVec1[7] > pVec2[7]) ? 0xffff : 0x0;
00240 
00241     return vector;
00242 }
00243 
00245 AkForceInline AKSIMD_V4UI32 AKSIMD_CMPLE_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00246 {
00247     AKSIMD_V4UI32 vector;
00248     
00249     vector.m_data[0] = (in_vec1.m_data[0] <= in_vec2.m_data[0]) ? 0xffffffff : 0x0;
00250     vector.m_data[1] = (in_vec1.m_data[1] <= in_vec2.m_data[1]) ? 0xffffffff : 0x0;
00251     vector.m_data[2] = (in_vec1.m_data[2] <= in_vec2.m_data[2]) ? 0xffffffff : 0x0;
00252     vector.m_data[3] = (in_vec1.m_data[3] <= in_vec2.m_data[3]) ? 0xffffffff : 0x0;
00253     
00254     return vector;
00255 }
00256 
00257 
00258 AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTLEFT_V4I32( AKSIMD_V4I32 in_vector, int in_shiftBy)
00259 {
00260     in_vector.m_data[0] <<= in_shiftBy;
00261     in_vector.m_data[1] <<= in_shiftBy;
00262     in_vector.m_data[2] <<= in_shiftBy;
00263     in_vector.m_data[3] <<= in_shiftBy;
00264     
00265     return in_vector;
00266 }
00267 
00268 AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTRIGHTARITH_V4I32( AKSIMD_V4I32 in_vector, int in_shiftBy)
00269 {
00270     in_vector.m_data[0] >>= in_shiftBy;
00271     in_vector.m_data[1] >>= in_shiftBy;
00272     in_vector.m_data[2] >>= in_shiftBy;
00273     in_vector.m_data[3] >>= in_shiftBy;
00274     
00275     return in_vector;
00276 }
00277 
00279 
00280 
00281 
00284 
00285 // _mm_sub_ps
00286 AkForceInline AKSIMD_V4F32 AKSIMD_SUB_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00287 {
00288     AKSIMD_V4F32 vector;
00289     
00290     vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
00291     vector.m_data[1] = in_vec1.m_data[1] - in_vec2.m_data[1];
00292     vector.m_data[2] = in_vec1.m_data[2] - in_vec2.m_data[2];
00293     vector.m_data[3] = in_vec1.m_data[3] - in_vec2.m_data[3];
00294     
00295     return vector;
00296 }
00297 
00301 
00302 AkForceInline AKSIMD_V4F32 AKSIMD_SUB_SS_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00303 {
00304     AKSIMD_V4F32 vector;
00305     
00306     vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
00307     vector.m_data[1] = in_vec1.m_data[1];
00308     vector.m_data[2] = in_vec1.m_data[2];
00309     vector.m_data[3] = in_vec1.m_data[3];
00310     
00311     return vector;
00312 }
00313 
00314 // _mm_add_ps
00315 AkForceInline AKSIMD_V4F32 AKSIMD_ADD_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00316 {
00317     AKSIMD_V4F32 vector;
00318     
00319     vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
00320     vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];
00321     vector.m_data[2] = in_vec1.m_data[2] + in_vec2.m_data[2];
00322     vector.m_data[3] = in_vec1.m_data[3] + in_vec2.m_data[3];
00323     
00324     return vector;
00325 }
00326 
00327 AkForceInline AKSIMD_V2F32 AKSIMD_ADD_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00328 {
00329     AKSIMD_V2F32 vector;
00330     
00331     vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
00332     vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];
00333     
00334     return vector;
00335 }
00336 
00340 AkForceInline AKSIMD_V4F32 AKSIMD_ADD_SS_V4F32( const AKSIMD_V4F32& a, const AKSIMD_V4F32& b )
00341 {
00342     AKSIMD_V4F32 vector;
00343     
00344     vector.m_data[0] = a.m_data[0] + b.m_data[0];
00345     vector.m_data[1] = a.m_data[1];
00346     vector.m_data[2] = a.m_data[2];
00347     vector.m_data[3] = a.m_data[3];
00348     
00349     return vector;
00350 }
00351 
00352 // _mm_mul_ps
00353 AkForceInline AKSIMD_V4F32 AKSIMD_MUL_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00354 {
00355     AKSIMD_V4F32 vector;
00356     
00357     vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
00358     vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];
00359     vector.m_data[2] = in_vec1.m_data[2] * in_vec2.m_data[2];
00360     vector.m_data[3] = in_vec1.m_data[3] * in_vec2.m_data[3];
00361     
00362     return vector;
00363 }
00364 
00365 AkForceInline AKSIMD_V2F32 AKSIMD_MUL_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00366 {
00367     AKSIMD_V2F32 vector;
00368     
00369     vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
00370     vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];
00371     
00372     return vector;
00373 }
00374 
00379 AkForceInline AKSIMD_V4F32 AKSIMD_MUL_SS_V4F32( const AKSIMD_V4F32& a, const AKSIMD_V4F32& b )
00380 {
00381     AKSIMD_V4F32 vector;
00382     
00383     vector.m_data[0] = a.m_data[0] * b.m_data[0];
00384     vector.m_data[1] = a.m_data[1];
00385     vector.m_data[2] = a.m_data[2];
00386     vector.m_data[3] = a.m_data[3];
00387     
00388     return vector;
00389 }
00390 
/// Element-wise multiply-add: (__a__ * __b__) + __c__, built from AKSIMD_MUL_V4F32 and AKSIMD_ADD_V4F32.
00392 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
/// Element-wise multiply-subtract: (__a__ * __b__) - __c__.
00393 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
00394 
/// Scalar multiply-add: element 0 is a0 * b0 + c0; elements 1-3 come from __a__.
00396 #define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( (__a__), (__b__) ), (__c__) )
00397 
00398 // _mm_min_ps
00399 AkForceInline AKSIMD_V4F32 AKSIMD_MIN_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00400 {
00401     AKSIMD_V4F32 vector;
00402     
00403     vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
00404     vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);
00405     vector.m_data[2] = AkMin(in_vec1.m_data[2], in_vec2.m_data[2]);
00406     vector.m_data[3] = AkMin(in_vec1.m_data[3], in_vec2.m_data[3]);
00407     
00408     return vector;
00409 }
00410 
00411 AkForceInline AKSIMD_V2F32 AKSIMD_MIN_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00412 {
00413     AKSIMD_V2F32 vector;
00414     
00415     vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
00416     vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);
00417     
00418     return vector;
00419 }
00420 
00421 // _mm_max_ps
00422 AkForceInline AKSIMD_V4F32 AKSIMD_MAX_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00423 {
00424     AKSIMD_V4F32 vector;
00425     
00426     vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
00427     vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);
00428     vector.m_data[2] = AkMax(in_vec1.m_data[2], in_vec2.m_data[2]);
00429     vector.m_data[3] = AkMax(in_vec1.m_data[3], in_vec2.m_data[3]);
00430     
00431     return vector;
00432 }
00433 
00434 AkForceInline AKSIMD_V2F32 AKSIMD_MAX_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00435 {
00436     AKSIMD_V2F32 vector;
00437     
00438     vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
00439     vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);
00440     
00441     return vector;
00442 }
00443 
00444 AkForceInline AKSIMD_V4F32 AKSIMD_ABS_V4F32( const AKSIMD_V4F32& in_vec1 )
00445 {
00446     AKSIMD_V4F32 vector;
00447     vector.m_data[0] = fabs(in_vec1.m_data[0]);
00448     vector.m_data[1] = fabs(in_vec1.m_data[1]);
00449     vector.m_data[2] = fabs(in_vec1.m_data[2]);
00450     vector.m_data[3] = fabs(in_vec1.m_data[3]);
00451     return vector;
00452 }
00453 
00454 AkForceInline AKSIMD_V4F32 AKSIMD_NEG_V4F32( const AKSIMD_V4F32& in_vec1 )
00455 {
00456     AKSIMD_V4F32 vector;
00457     vector.m_data[0] = -in_vec1.m_data[0];
00458     vector.m_data[1] = -in_vec1.m_data[1];
00459     vector.m_data[2] = -in_vec1.m_data[2];
00460     vector.m_data[3] = -in_vec1.m_data[3];
00461     return vector;
00462 }
00463 
00464 // _mm_sqrt_ps
00465 AkForceInline AKSIMD_V4F32 AKSIMD_SQRT_V4F32( const AKSIMD_V4F32& in_vec )
00466 {
00467         AKSIMD_V4F32 vCompare;
00468         AKSIMD_GETELEMENT_V4F32(vCompare,0) = sqrt( AKSIMD_GETELEMENT_V4F32(in_vec,0) );
00469         AKSIMD_GETELEMENT_V4F32(vCompare,1) = sqrt( AKSIMD_GETELEMENT_V4F32(in_vec,1) );
00470         AKSIMD_GETELEMENT_V4F32(vCompare,2) = sqrt( AKSIMD_GETELEMENT_V4F32(in_vec,2) );
00471         AKSIMD_GETELEMENT_V4F32(vCompare,3) = sqrt( AKSIMD_GETELEMENT_V4F32(in_vec,3) );
00472 
00473         //AKSIMD_V4F32 res = vrecpeq_f32( vrsqrteq_f32( in_vec ) );
00474 
00475         return vCompare /*res*/;
00476 }
00477 
00478 AkForceInline AKSIMD_V2F32 AKSIMD_SQRT_V2F32( const AKSIMD_V2F32& in_vec )
00479 {
00480     AKSIMD_V2F32 vCompare;
00481     AKSIMD_GETELEMENT_V4F32(vCompare,0) = sqrt( AKSIMD_GETELEMENT_V4F32(in_vec,0) );
00482     AKSIMD_GETELEMENT_V4F32(vCompare,1) = sqrt( AKSIMD_GETELEMENT_V4F32(in_vec,1) );
00483     
00484     //AKSIMD_V4F32 res = vrecpeq_f32( vrsqrteq_f32( in_vec ) );
00485     
00486     return vCompare /*res*/;
00487 }
00488 
00490 
00491 
00492 
00495 
00496 
00497 //
00498 // _mm_unpacklo_epi16
00499 // r0 := a0
00500 // r1 := b0
00501 // r2 := a1
00502 // r3 := b1
00503 // r4 := a2
00504 // r5 := b2
00505 // r6 := a3
00506 // r7 := b3
00507 AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKLO_VECTOR8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00508 {
00509     AKSIMD_V4I32 vector;
00510     AkInt16 *pVec1,*pVec2,*pDest;
00511     pVec1 = (AkInt16*)&in_vec1;
00512     pVec2 = (AkInt16*)&in_vec2;
00513     pDest = (AkInt16*)&vector;
00514     
00515     pDest[0] = pVec1[0];
00516     pDest[1] = pVec2[0];    
00517     pDest[2] = pVec1[1];    
00518     pDest[3] = pVec2[1];
00519     pDest[4] = pVec1[2];
00520     pDest[5] = pVec2[2];
00521     pDest[6] = pVec1[3];
00522     pDest[7] = pVec2[3];
00523     
00524     return vector;
00525 }
00526 
00527 // _mm_unpackhi_epi16
00528 AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKHI_VECTOR8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00529 {
00530     AKSIMD_V4I32 vector;
00531     AkInt16 *pVec1,*pVec2,*pDest;
00532     pVec1 = (AkInt16*)&in_vec1;
00533     pVec2 = (AkInt16*)&in_vec2;
00534     pDest = (AkInt16*)&vector;
00535     
00536     pDest[0] = pVec1[4];
00537     pDest[1] = pVec2[4];    
00538     pDest[2] = pVec1[5];    
00539     pDest[3] = pVec2[5];
00540     pDest[4] = pVec1[6];
00541     pDest[5] = pVec2[6];
00542     pDest[6] = pVec1[7];
00543     pDest[7] = pVec2[7];
00544     
00545     return vector;
00546 }
00547 
00548 // _mm_unpacklo_ps
00549 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00550 {
00551     AKSIMD_V4F32 vector;
00552     vector.m_data[0] = in_vec1.m_data[0];
00553     vector.m_data[1] = in_vec2.m_data[0];
00554     vector.m_data[2] = in_vec1.m_data[1];
00555     vector.m_data[3] = in_vec2.m_data[1];
00556     
00557     return vector;
00558 }
00559 
00560 // _mm_unpackhi_ps
00561 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00562 {
00563     AKSIMD_V4F32 vector;
00564     vector.m_data[0] = in_vec1.m_data[2];
00565     vector.m_data[1] = in_vec2.m_data[2];
00566     vector.m_data[2] = in_vec1.m_data[3];
00567     vector.m_data[3] = in_vec2.m_data[3];
00568     
00569     return vector;
00570 }
00571 
00572 // _mm_packs_epi32
00573 AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00574 {
00575     AKSIMD_V4I32 vector;
00576     AkInt16 *pDest = (AkInt16*)&vector;
00577     
00578     pDest[0] = AkClamp( in_vec1.m_data[0], -32768, 32767);
00579     pDest[1] = AkClamp( in_vec1.m_data[1], -32768, 32767);  
00580     pDest[2] = AkClamp( in_vec1.m_data[2], -32768, 32767);  
00581     pDest[3] = AkClamp( in_vec1.m_data[3], -32768, 32767);
00582     pDest[4] = AkClamp( in_vec2.m_data[0], -32768, 32767);
00583     pDest[5] = AkClamp( in_vec2.m_data[1], -32768, 32767);
00584     pDest[6] = AkClamp( in_vec2.m_data[2], -32768, 32767);
00585     pDest[7] = AkClamp( in_vec2.m_data[3], -32768, 32767);
00586     
00587     return vector;
00588 }
00589 
00591 
00592 
00593 
00594 //#define AKSIMD_GET_ITEM( vec, index ) vec[index]
00595 
00596 
00597 
00598 
00601 
00602 
00603 // See _MM_SHUFFLE
// Builds an 8-bit shuffle mask from four 2-bit selectors: fp0 occupies bits 0-1,
// fp1 bits 2-3, fp2 bits 4-5 and fp3 bits 6-7.
00604 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
00605     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
00606 
00607 // See _mm_shuffle_ps
00608 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
00609 //#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw )
00610 
00611  AkForceInline AKSIMD_V4F32 AKSIMD_SHUFFLE_V4F32( const AKSIMD_V4F32& xyzw, const AKSIMD_V4F32& abcd, int mask )
00612 {
00613     AKSIMD_V4F32 vector;
00614     vector.m_data[0] = xyzw.m_data[(mask) & 0x3];
00615     vector.m_data[1] = xyzw.m_data[(mask >> 2) & 0x3];
00616     vector.m_data[2] = abcd.m_data[(mask >> 4) & 0x3];
00617     vector.m_data[3] = abcd.m_data[(mask >> 6) & 0x3];
00618     
00619     return vector;
00620 }
00621 
00622 
/// Result is { b[2], b[3], a[2], a[3] }: the upper two elements of b followed
/// by the upper two elements of a.
00628 #define AKSIMD_MOVEHL_V4F32( a, b ) \
00629     AKSIMD_SHUFFLE_V4F32( (b), (a), AKSIMD_SHUFFLE(3, 2, 3, 2) )
00630 
/// Result is { a[0], a[1], b[0], b[1] }: the lower two elements of a followed
/// by the lower two elements of b.
00636 #define AKSIMD_MOVELH_V4F32( a, b ) \
00637     AKSIMD_SHUFFLE_V4F32( (a), (b), AKSIMD_SHUFFLE(1, 0, 1, 0) )
00638 
/// Swaps the elements of each pair: { a, b, c, d } -> { b, a, d, c }.
/// No trailing semicolon, so the macro can be used inside larger expressions.
#define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))

/// Swaps the two halves: { a, b, c, d } -> { c, d, a, b }.
/// No trailing semicolon, so the macro can be used inside larger expressions.
#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
00644 
/// Duplicates the odd-indexed elements: { a, b, c, d } -> { b, b, d, d }.
/// The argument is parenthesized so any expression can be passed safely.
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32((__vv), (__vv), AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even-indexed elements: { a, b, c, d } -> { a, a, c, c }.
/// The argument is parenthesized so any expression can be passed safely.
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32((__vv), (__vv), AKSIMD_SHUFFLE(2,2,0,0))
00650 
00651 
00652 //#include <AK/SoundEngine/Platforms/Generic/AkSimdShuffle.h>
00653 
00655 
00656 
// Old AKSIMD -- will search-and-replace later
// Legacy aliases that forward to the current AKSIMD_* names.
#define AkReal32Vector AKSIMD_V4F32
// AKSIMD_LOAD1_V4F32 takes its scalar by value in this generic implementation,
// so the scalar itself is forwarded rather than its address.
#define AKSIMD_LOAD1( __scalar__ ) AKSIMD_LOAD1_V4F32( (__scalar__) )
#define AKSIMD_LOADVEC(v) AKSIMD_LOAD_V4F32((const AKSIMD_F32*)((v)))
#define AKSIMD_MUL AKSIMD_MUL_V4F32
#define AKSIMD_STOREVEC AKSIMD_STORE_V4F32
00663 
00668 static AkForceInline void AKSIMD_HORIZONTALADD( AKSIMD_V4F32 & vVec )
00669 {   
00670     AKSIMD_V4F32 vHighLow = AKSIMD_MOVEHL_V4F32(vVec, vVec);
00671     vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
00672     vHighLow = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0x55);
00673     vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
00674 } 
00675 
00677 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00678 {
00679     static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f }; 
00680 
00681     AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0)); 
00682     vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
00683     AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1)); 
00684     vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
00685     vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
00686     vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 ); 
00687     vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
00688     return vTmp2;
00689 }
00690 
/// Broadcasts element idx of var to all four elements of the result.
/// The vector argument is parenthesized so any expression can be passed safely.
#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32((var), (var), AKSIMD_SHUFFLE(idx,idx,idx,idx))
00692 
00693 #endif //_AKSIMD_GENERIC_H_
00694