
include/AK/SoundEngine/Platforms/Generic/AkSimd.h

Go to the documentation of this file.
00001 /*******************************************************************************
00002 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
00003 released in source code form as part of the SDK installer package.
00004 
00005 Commercial License Usage
00006 
00007 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
00008 may use this file in accordance with the end user license agreement provided 
00009 with the software or, alternatively, in accordance with the terms contained in a
00010 written agreement between you and Audiokinetic Inc.
00011 
00012 Apache License Usage
00013 
00014 Alternatively, this file may be used under the Apache License, Version 2.0 (the 
00015 "Apache License"); you may not use this file except in compliance with the 
00016 Apache License. You may obtain a copy of the Apache License at 
00017 http://www.apache.org/licenses/LICENSE-2.0.
00018 
00019 Unless required by applicable law or agreed to in writing, software distributed
00020 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
00021 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
00022 the specific language governing permissions and limitations under the License.
00023 
00024   Version: <VERSION>  Build: <BUILDNUMBER>
00025   Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
00026 *******************************************************************************/
00027 
00028 // AkSimd.h
00029 
00030 /// \file 
00031 /// AKSIMD - Generic (no SIMD support) implementation
00032 
00033 #ifndef _AKSIMD_GENERIC_H_
00034 #define _AKSIMD_GENERIC_H_
00035 
00036 #include <math.h>
00037 #include <string.h>
00038 #include <AK/SoundEngine/Common/AkTypes.h>
00039 #include <AK/Tools/Common/AkPlatformFuncs.h>
00040 
00041 ////////////////////////////////////////////////////////////////////////
00042 /// @name AKSIMD types
00043 //@{
00044 typedef AkInt32 AKSIMD_I32;                                 ///< 32-bit signed integer
00045 typedef struct { AkInt32 m_data[4]; } AKSIMD_V4I32;         ///< Vector of 4 32-bit signed integers
00046 typedef struct { AkUInt32 m_data[4]; } AKSIMD_V4UI32;       ///< Vector of 4 32-bit unsigned integers
00047 typedef AkReal32 AKSIMD_F32;                                ///< 32-bit float
00048 typedef struct { AkReal32 m_data[2]; } AKSIMD_V2F32;        ///< Vector of 2 32-bit floats
00049 typedef struct { AkReal32 m_data[4]; } AKSIMD_V4F32;        ///< Vector of 4 32-bit floats
00050 typedef AKSIMD_V4UI32   AKSIMD_V4COND;                      ///< Vector of 4 comparison results
00051 
00052 #pragma pack(push,1)
00053 typedef struct { AkInt32 m_data[4]; }  AKSIMD_V4I32_UNALIGNED;      ///< Unaligned Vector of 4 32-bit signed integers
00054 typedef struct { AkUInt32 m_data[4]; } AKSIMD_V4UI32_UNALIGNED;     ///< Unaligned Vector of 4 32-bit unsigned integers
00055 typedef struct { AkReal32 m_data[2]; } AKSIMD_V2F32_UNALIGNED;      ///< Unaligned Vector of 2 32-bit floats
00056 typedef struct { AkReal32 m_data[4]; } AKSIMD_V4F32_UNALIGNED;      ///< Unaligned Vector of 4 32-bit floats
00057 #pragma pack(pop)
00058 
00059 //@}
00060 ////////////////////////////////////////////////////////////////////////
00061 
00062 #ifndef AKSIMD_GETELEMENT_V4F32
00063 #define AKSIMD_GETELEMENT_V4F32( __vName, __num__ )             (__vName).m_data[(__num__)]
00064 #endif
00065 
00066 #ifndef AKSIMD_GETELEMENT_V2F32
00067 #define AKSIMD_GETELEMENT_V2F32( __vName, __num__ )             (__vName).m_data[(__num__)]
00068 #endif
00069 
00070 #ifndef AKSIMD_GETELEMENT_V4I32
00071 #define AKSIMD_GETELEMENT_V4I32( __vName, __num__ )             (__vName).m_data[(__num__)]
00072 #endif
00073 
00074 ////////////////////////////////////////////////////////////////////////
00075 /// @name Platform specific memory size alignment for allocation purposes
00076 //@{
00077 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
00078 //@}
00079 ////////////////////////////////////////////////////////////////////////
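For reference, a minimal usage sketch of the alignment macro above (the variable names and values are purely illustrative):

    // AKSIMD_ALIGNSIZE rounds a byte count up to the next multiple of 16:
    // AKSIMD_ALIGNSIZE(1) == 16, AKSIMD_ALIGNSIZE(16) == 16, AKSIMD_ALIGNSIZE(17) == 32.
    AkUInt32 uNumSamples = 37;                                                 // hypothetical frame count
    AkUInt32 uAllocSize  = AKSIMD_ALIGNSIZE( uNumSamples * sizeof(AkReal32) ); // 148 bytes rounded up to 160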
00080 
00081 ////////////////////////////////////////////////////////////////////////
00082 /// @name AKSIMD loading / setting
00083 //@{
00084 #define AKSIMD_LOADU_V4I32( in_pData ) (*(in_pData))
00085 
00086 #define AKSIMD_LOADU_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))
00087 
00088 #define AKSIMD_LOAD_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))
00089 
00090 AkForceInline AKSIMD_V4F32 AKSIMD_LOAD1_V4F32( AKSIMD_F32 in_value )
00091 {
00092     AKSIMD_V4F32 vector;
00093     vector.m_data[0] = in_value;
00094     vector.m_data[1] = in_value;
00095     vector.m_data[2] = in_value;
00096     vector.m_data[3] = in_value;
00097     
00098     return vector;
00099 }
00100 
00101 // _mm_set_ps1
00102 AkForceInline AKSIMD_V4F32 AKSIMD_SET_V4F32( AKSIMD_F32 in_value )
00103 {
00104     AKSIMD_V4F32 vector;
00105     vector.m_data[0] = in_value;
00106     vector.m_data[1] = in_value;
00107     vector.m_data[2] = in_value;
00108     vector.m_data[3] = in_value;
00109     
00110     return vector;
00111 }
00112 
00113 
00114 AkForceInline AKSIMD_V2F32 AKSIMD_SET_V2F32( AKSIMD_F32 in_value )
00115 {
00116     AKSIMD_V2F32 vector;
00117     vector.m_data[0] = in_value;
00118     vector.m_data[1] = in_value;
00119     
00120     return vector;
00121 }
00122 
00123 // _mm_setzero_ps()
00124 AkForceInline AKSIMD_V4F32 AKSIMD_SETZERO_V4F32()
00125 {
00126     AKSIMD_V4F32 vector;
00127     vector.m_data[0] = 0.f;
00128     vector.m_data[1] = 0.f;
00129     vector.m_data[2] = 0.f;
00130     vector.m_data[3] = 0.f;
00131     
00132     return vector;
00133 }
00134 
00135 AkForceInline AKSIMD_V2F32 AKSIMD_SETZERO_V2F32()
00136 {
00137     AKSIMD_V2F32 vector;
00138     vector.m_data[0] = 0.f;
00139     vector.m_data[1] = 0.f;
00140     
00141     return vector;
00142 }
00143 // _mm_setzero_si128()
00144 AkForceInline AKSIMD_V4I32 AKSIMD_SETZERO_V4I32()
00145 {
00146     AKSIMD_V4I32 vector;
00147     vector.m_data[0] = 0;
00148     vector.m_data[1] = 0;
00149     vector.m_data[2] = 0;
00150     vector.m_data[3] = 0;
00151     
00152     return vector;
00153 }
00154 
00155 
00156 /// Loads a single-precision, floating-point value into the low word
00157 /// and clears the upper three words.
00158 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
00159 AkForceInline AKSIMD_V4F32 AKSIMD_LOAD_SS_V4F32( const AKSIMD_F32* in_pData )
00160 {
00161     AKSIMD_V4F32 vector;
00162     vector.m_data[0] = *in_pData;
00163     vector.m_data[1] = 0.f;
00164     vector.m_data[2] = 0.f;
00165     vector.m_data[3] = 0.f;
00166     
00167     return vector;
00168 }
00169 
00170 //@}
00171 ////////////////////////////////////////////////////////////////////////
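A short, hypothetical sketch of how the setters and the single-element load above are typically combined (not part of the header; variable names are illustrative):

    AKSIMD_V4F32 vGain = AKSIMD_SET_V4F32( 0.5f );         // { 0.5, 0.5, 0.5, 0.5 }
    AKSIMD_V4F32 vZero = AKSIMD_SETZERO_V4F32();           // { 0, 0, 0, 0 }
    AkReal32 fLevel    = 1.f;
    AKSIMD_V4F32 vLow  = AKSIMD_LOAD_SS_V4F32( &fLevel );  // { 1, 0, 0, 0 }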
00172 
00173 ////////////////////////////////////////////////////////////////////////
00174 /// @name AKSIMD storing
00175 //@{
00176 
00177 // _mm_storeu_ps -- The address does not need to be 16-byte aligned.
00178 #define AKSIMD_STOREU_V4F32( in_pTo, in_vec ) (*(AKSIMD_V4F32*)(in_pTo)) = (in_vec)
00179 
00180 // _mm_store_ps -- The address must be 16-byte aligned.
00181 // In this generic implementation there is no alignment requirement, so the aligned store maps to the unaligned one.
00182 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) AKSIMD_STOREU_V4F32(__addr__, __vName__)
00183 
00184 // _mm_storeu_si128
00185 #define AKSIMD_STOREU_V4I32( in_pTo, in_vec ) (*(AKSIMD_V4I32*)(in_pTo)) = (in_vec)
00186 
00187 /// Stores the lower single-precision, floating-point value.
00188 /// *p := a0 (see _mm_store_ss)
00189 AkForceInline void AKSIMD_STORE1_V4F32( AKSIMD_F32* in_pTo, const AKSIMD_V4F32& in_vec )
00190 {
00191     ((AKSIMD_V4F32*)in_pTo)->m_data[0] = in_vec.m_data[0];
00192 }
00193 
00194 //@}
00195 ////////////////////////////////////////////////////////////////////////
00196 
00197 ////////////////////////////////////////////////////////////////////////
00198 /// @name AKSIMD conversion
00199 //@{
00200 
00201 // _mm_cvtepi32_ps
00202 AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4I32_TO_V4F32( const AKSIMD_V4I32& in_from )
00203 {
00204     AKSIMD_V4F32 vector;
00205     vector.m_data[0] = (AkReal32)in_from.m_data[0];
00206     vector.m_data[1] = (AkReal32)in_from.m_data[1];
00207     vector.m_data[2] = (AkReal32)in_from.m_data[2];
00208     vector.m_data[3] = (AkReal32)in_from.m_data[3];
00209     
00210     return vector;
00211 }
00212 // _mm_cvtps_epi32
00213 AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4I32( const AKSIMD_V4F32& in_from )
00214 {
00215     AKSIMD_V4I32 vector;
00216     vector.m_data[0] = (AkInt32)in_from.m_data[0];
00217     vector.m_data[1] = (AkInt32)in_from.m_data[1];
00218     vector.m_data[2] = (AkInt32)in_from.m_data[2];
00219     vector.m_data[3] = (AkInt32)in_from.m_data[3];
00220     
00221     return vector;
00222 }
00223 
00224 //@}
00225 ////////////////////////////////////////////////////////////////////////
00226 
00227 ////////////////////////////////////////////////////////////////////////
00228 /// @name AKSIMD logical operations
00229 //@{
00230 
00231 // _mm_and_si128
00232 AkForceInline AKSIMD_V4I32 AKSIMD_AND_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00233 {
00234     AKSIMD_V4I32 vector;
00235     vector.m_data[0] = in_vec1.m_data[0] & in_vec2.m_data[0];
00236     vector.m_data[1] = in_vec1.m_data[1] & in_vec2.m_data[1];
00237     vector.m_data[2] = in_vec1.m_data[2] & in_vec2.m_data[2];
00238     vector.m_data[3] = in_vec1.m_data[3] & in_vec2.m_data[3];
00239     
00240     return vector;
00241 }
00242 
00243 /// Compares the 8 signed 16-bit integers in a and the 8 signed
00244 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
00245 AkForceInline AKSIMD_V4I32 AKSIMD_CMPGT_V8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00246 {
00247     AKSIMD_V4I32 vector;
00248     
00249     AkInt16 *pVec1,*pVec2,*pVec3;
00250     pVec1 = (AkInt16*)&in_vec1;
00251     pVec2 = (AkInt16*)&in_vec2;
00252     pVec3 = (AkInt16*)&vector;
00253     
00254     pVec3[0] = (pVec1[0] > pVec2[0]) ? 0xffff : 0x0;
00255     pVec3[1] = (pVec1[1] > pVec2[1]) ? 0xffff : 0x0;
00256     pVec3[2] = (pVec1[2] > pVec2[2]) ? 0xffff : 0x0;
00257     pVec3[3] = (pVec1[3] > pVec2[3]) ? 0xffff : 0x0;
00258     pVec3[4] = (pVec1[4] > pVec2[4]) ? 0xffff : 0x0;
00259     pVec3[5] = (pVec1[5] > pVec2[5]) ? 0xffff : 0x0;
00260     pVec3[6] = (pVec1[6] > pVec2[6]) ? 0xffff : 0x0;
00261     pVec3[7] = (pVec1[7] > pVec2[7]) ? 0xffff : 0x0;
00262 
00263     return vector;
00264 }
00265 
00266 /// Compares for less than or equal (see _mm_cmple_ps)
00267 AkForceInline AKSIMD_V4UI32 AKSIMD_CMPLE_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00268 {
00269     AKSIMD_V4UI32 vector;
00270     
00271     vector.m_data[0] = (in_vec1.m_data[0] <= in_vec2.m_data[0]) ? 0xffffffff : 0x0;
00272     vector.m_data[1] = (in_vec1.m_data[1] <= in_vec2.m_data[1]) ? 0xffffffff : 0x0;
00273     vector.m_data[2] = (in_vec1.m_data[2] <= in_vec2.m_data[2]) ? 0xffffffff : 0x0;
00274     vector.m_data[3] = (in_vec1.m_data[3] <= in_vec2.m_data[3]) ? 0xffffffff : 0x0;
00275     
00276     return vector;
00277 }
00278 
00279 AkForceInline AKSIMD_V4F32 AKSIMD_GTEQ_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
00280 {
00281     AKSIMD_V4F32 vector;
00282 
00283     vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] >= in_vec2.m_data[0]) ? 0xffffffff : 0x0);
00284     vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] >= in_vec2.m_data[1]) ? 0xffffffff : 0x0);
00285     vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] >= in_vec2.m_data[2]) ? 0xffffffff : 0x0);
00286     vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] >= in_vec2.m_data[3]) ? 0xffffffff : 0x0);
00287 
00288     return vector;
00289 }
00290 
00291 AkForceInline AKSIMD_V4F32 AKSIMD_GT_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
00292 {
00293     AKSIMD_V4F32 vector;
00294 
00295     vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] > in_vec2.m_data[0]) ? 0xffffffff : 0x0);
00296     vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] > in_vec2.m_data[1]) ? 0xffffffff : 0x0);
00297     vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] > in_vec2.m_data[2]) ? 0xffffffff : 0x0);
00298     vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] > in_vec2.m_data[3]) ? 0xffffffff : 0x0);
00299 
00300     return vector;
00301 }
00302 
00303 AkForceInline AKSIMD_V4F32 AKSIMD_LTEQ_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
00304 {
00305     AKSIMD_V4F32 vector;
00306 
00307     vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] <= in_vec2.m_data[0]) ? 0xffffffff : 0x0);
00308     vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] <= in_vec2.m_data[1]) ? 0xffffffff : 0x0);
00309     vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] <= in_vec2.m_data[2]) ? 0xffffffff : 0x0);
00310     vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] <= in_vec2.m_data[3]) ? 0xffffffff : 0x0);
00311 
00312     return vector;
00313 }
00314 
00315 AkForceInline AKSIMD_V4F32 AKSIMD_LT_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
00316 {
00317     AKSIMD_V4F32 vector;
00318 
00319     vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] < in_vec2.m_data[0]) ? 0xffffffff : 0x0);
00320     vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] < in_vec2.m_data[1]) ? 0xffffffff : 0x0);
00321     vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] < in_vec2.m_data[2]) ? 0xffffffff : 0x0);
00322     vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] < in_vec2.m_data[3]) ? 0xffffffff : 0x0);
00323 
00324     return vector;
00325 }
00326 
00327 AkForceInline AKSIMD_V4F32 AKSIMD_EQ_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
00328 {
00329     AKSIMD_V4F32 vector;
00330 
00331     vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] == in_vec2.m_data[0]) ? 0xffffffff : 0x0);
00332     vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] == in_vec2.m_data[1]) ? 0xffffffff : 0x0);
00333     vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] == in_vec2.m_data[2]) ? 0xffffffff : 0x0);
00334     vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] == in_vec2.m_data[3]) ? 0xffffffff : 0x0);
00335 
00336     return vector;
00337 }
00338 
00339 AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00340 {
00341     AKSIMD_V4F32 vector;
00342     
00343     vector.m_data[0] = (AkReal32)(((AkUInt32)in_vec1.m_data[0]) ^ ((AkUInt32)in_vec2.m_data[0]));
00344     vector.m_data[1] = (AkReal32)(((AkUInt32)in_vec1.m_data[1]) ^ ((AkUInt32)in_vec2.m_data[1]));
00345     vector.m_data[2] = (AkReal32)(((AkUInt32)in_vec1.m_data[2]) ^ ((AkUInt32)in_vec2.m_data[2]));
00346     vector.m_data[3] = (AkReal32)(((AkUInt32)in_vec1.m_data[3]) ^ ((AkUInt32)in_vec2.m_data[3]));
00347     
00348     return vector;
00349 }
00350 
00351 AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTLEFT_V4I32( AKSIMD_V4I32 in_vector, int in_shiftBy)
00352 {
00353     in_vector.m_data[0] <<= in_shiftBy;
00354     in_vector.m_data[1] <<= in_shiftBy;
00355     in_vector.m_data[2] <<= in_shiftBy;
00356     in_vector.m_data[3] <<= in_shiftBy;
00357     
00358     return in_vector;
00359 }
00360 
00361 AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTRIGHTARITH_V4I32( AKSIMD_V4I32 in_vector, int in_shiftBy)
00362 {
00363     in_vector.m_data[0] >>= in_shiftBy;
00364     in_vector.m_data[1] >>= in_shiftBy;
00365     in_vector.m_data[2] >>= in_shiftBy;
00366     in_vector.m_data[3] >>= in_shiftBy;
00367     
00368     return in_vector;
00369 }
00370 
00371 //@}
00372 ////////////////////////////////////////////////////////////////////////
00373 
00374 
00375 ////////////////////////////////////////////////////////////////////////
00376 /// @name AKSIMD arithmetic
00377 //@{
00378 // _mm_sub_ps
00379 AkForceInline AKSIMD_V4F32 AKSIMD_SUB_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00380 {
00381     AKSIMD_V4F32 vector;
00382     
00383     vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
00384     vector.m_data[1] = in_vec1.m_data[1] - in_vec2.m_data[1];
00385     vector.m_data[2] = in_vec1.m_data[2] - in_vec2.m_data[2];
00386     vector.m_data[3] = in_vec1.m_data[3] - in_vec2.m_data[3];
00387     
00388     return vector;
00389 }
00390 
00391 /// Subtracts the lower single-precision, floating-point values of a and b.
00392 /// The upper three single-precision, floating-point values are passed through from a.
00393 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
00394 
00395 AkForceInline AKSIMD_V4F32 AKSIMD_SUB_SS_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00396 {
00397     AKSIMD_V4F32 vector;
00398     
00399     vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
00400     vector.m_data[1] = in_vec1.m_data[1];
00401     vector.m_data[2] = in_vec1.m_data[2];
00402     vector.m_data[3] = in_vec1.m_data[3];
00403     
00404     return vector;
00405 }
00406 
00407 // _mm_add_ps
00408 AkForceInline AKSIMD_V4F32 AKSIMD_ADD_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00409 {
00410     AKSIMD_V4F32 vector;
00411     
00412     vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
00413     vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];
00414     vector.m_data[2] = in_vec1.m_data[2] + in_vec2.m_data[2];
00415     vector.m_data[3] = in_vec1.m_data[3] + in_vec2.m_data[3];
00416     
00417     return vector;
00418 }
00419 
00420 AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 ) 
00421 {
00422     AKSIMD_V4F32 vector;
00423     
00424     vector.m_data[0] = in_vec1.m_data[0] / in_vec2.m_data[0];
00425     vector.m_data[1] = in_vec1.m_data[1] / in_vec2.m_data[1];
00426     vector.m_data[2] = in_vec1.m_data[2] / in_vec2.m_data[2];
00427     vector.m_data[3] = in_vec1.m_data[3] / in_vec2.m_data[3];
00428     
00429     return vector;
00430 }
00431 
00432 AkForceInline AKSIMD_V2F32 AKSIMD_ADD_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00433 {
00434     AKSIMD_V2F32 vector;
00435     
00436     vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
00437     vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];
00438     
00439     return vector;
00440 }
00441 
00442 /// Adds the lower single-precision, floating-point values of a and b; the
00443 /// upper three single-precision, floating-point values are passed through from a.
00444 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
00445 AkForceInline AKSIMD_V4F32 AKSIMD_ADD_SS_V4F32( const AKSIMD_V4F32& a, const AKSIMD_V4F32& b )
00446 {
00447     AKSIMD_V4F32 vector;
00448     
00449     vector.m_data[0] = a.m_data[0] + b.m_data[0];
00450     vector.m_data[1] = a.m_data[1];
00451     vector.m_data[2] = a.m_data[2];
00452     vector.m_data[3] = a.m_data[3];
00453     
00454     return vector;
00455 }
00456 
00457 // _mm_mul_ps
00458 AkForceInline AKSIMD_V4F32 AKSIMD_MUL_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00459 {
00460     AKSIMD_V4F32 vector;
00461     
00462     vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
00463     vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];
00464     vector.m_data[2] = in_vec1.m_data[2] * in_vec2.m_data[2];
00465     vector.m_data[3] = in_vec1.m_data[3] * in_vec2.m_data[3];
00466     
00467     return vector;
00468 }
00469 
00470 AkForceInline AKSIMD_V2F32 AKSIMD_MUL_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00471 {
00472     AKSIMD_V2F32 vector;
00473     
00474     vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
00475     vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];
00476     
00477     return vector;
00478 }
00479 
00480 /// Multiplies the lower single-precision, floating-point values of
00481 /// a and b; the upper three single-precision, floating-point values
00482 /// are passed through from a.
00483 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
00484 AkForceInline AKSIMD_V4F32 AKSIMD_MUL_SS_V4F32( const AKSIMD_V4F32& a, const AKSIMD_V4F32& b )
00485 {
00486     AKSIMD_V4F32 vector;
00487     
00488     vector.m_data[0] = a.m_data[0] * b.m_data[0];
00489     vector.m_data[1] = a.m_data[1];
00490     vector.m_data[2] = a.m_data[2];
00491     vector.m_data[3] = a.m_data[3];
00492     
00493     return vector;
00494 }
00495 
00496 /// Vector multiply-add and multiply-subtract operations.
00497 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
00498 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
00499 
00500 /// Vector multiply-add operation.
00501 #define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( (__a__), (__b__) ), (__c__) )
00502 
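As an illustration of the multiply-add macro above, a minimal sketch with arbitrary values: each output lane is a[i] * b[i] + c[i].

    AKSIMD_V4F32 vIn  = AKSIMD_SET_V4F32( 1.f );
    AKSIMD_V4F32 vWet = AKSIMD_SET_V4F32( 0.75f );
    AKSIMD_V4F32 vDry = AKSIMD_SET_V4F32( 0.25f );
    AKSIMD_V4F32 vOut = AKSIMD_MADD_V4F32( vIn, vWet, vDry ); // every lane: 1 * 0.75 + 0.25 = 1.0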
00503 // _mm_min_ps
00504 AkForceInline AKSIMD_V4F32 AKSIMD_MIN_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00505 {
00506     AKSIMD_V4F32 vector;
00507     
00508     vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
00509     vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);
00510     vector.m_data[2] = AkMin(in_vec1.m_data[2], in_vec2.m_data[2]);
00511     vector.m_data[3] = AkMin(in_vec1.m_data[3], in_vec2.m_data[3]);
00512     
00513     return vector;
00514 }
00515 
00516 AkForceInline AKSIMD_V2F32 AKSIMD_MIN_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00517 {
00518     AKSIMD_V2F32 vector;
00519     
00520     vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
00521     vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);
00522     
00523     return vector;
00524 }
00525 
00526 // _mm_max_ps
00527 AkForceInline AKSIMD_V4F32 AKSIMD_MAX_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00528 {
00529     AKSIMD_V4F32 vector;
00530     
00531     vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
00532     vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);
00533     vector.m_data[2] = AkMax(in_vec1.m_data[2], in_vec2.m_data[2]);
00534     vector.m_data[3] = AkMax(in_vec1.m_data[3], in_vec2.m_data[3]);
00535     
00536     return vector;
00537 }
00538 
00539 AkForceInline AKSIMD_V2F32 AKSIMD_MAX_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
00540 {
00541     AKSIMD_V2F32 vector;
00542     
00543     vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
00544     vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);
00545     
00546     return vector;
00547 }
00548 
00549 AkForceInline AKSIMD_V4F32 AKSIMD_ABS_V4F32( const AKSIMD_V4F32& in_vec1 )
00550 {
00551     AKSIMD_V4F32 vector;
00552     vector.m_data[0] = fabsf(in_vec1.m_data[0]);
00553     vector.m_data[1] = fabsf(in_vec1.m_data[1]);
00554     vector.m_data[2] = fabsf(in_vec1.m_data[2]);
00555     vector.m_data[3] = fabsf(in_vec1.m_data[3]);
00556     return vector;
00557 }
00558 
00559 AkForceInline AKSIMD_V4F32 AKSIMD_NEG_V4F32( const AKSIMD_V4F32& in_vec1 )
00560 {
00561     AKSIMD_V4F32 vector;
00562     vector.m_data[0] = -in_vec1.m_data[0];
00563     vector.m_data[1] = -in_vec1.m_data[1];
00564     vector.m_data[2] = -in_vec1.m_data[2];
00565     vector.m_data[3] = -in_vec1.m_data[3];
00566     return vector;
00567 }
00568 
00569 // _mm_sqrt_ps
00570 AkForceInline AKSIMD_V4F32 AKSIMD_SQRT_V4F32( const AKSIMD_V4F32& in_vec )
00571 {
00572         AKSIMD_V4F32 vCompare;
00573         AKSIMD_GETELEMENT_V4F32(vCompare,0) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,0) );
00574         AKSIMD_GETELEMENT_V4F32(vCompare,1) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,1) );
00575         AKSIMD_GETELEMENT_V4F32(vCompare,2) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,2) );
00576         AKSIMD_GETELEMENT_V4F32(vCompare,3) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,3) );
00577 
00578         //AKSIMD_V4F32 res = vrecpeq_f32( vrsqrteq_f32( in_vec ) );
00579 
00580         return vCompare /*res*/;
00581 }
00582 
00583 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
00584 AkForceInline AKSIMD_V4F32 AKSIMD_RSQRT_V4F32(const AKSIMD_V4F32& in_vec)
00585 {
00586     AKSIMD_V4F32 vCompare;
00587     AKSIMD_GETELEMENT_V4F32(vCompare, 0) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 0));
00588     AKSIMD_GETELEMENT_V4F32(vCompare, 1) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 1));
00589     AKSIMD_GETELEMENT_V4F32(vCompare, 2) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 2));
00590     AKSIMD_GETELEMENT_V4F32(vCompare, 3) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 3));
00591 
00592     return vCompare;
00593 }
00594 
00595 AkForceInline AKSIMD_V2F32 AKSIMD_SQRT_V2F32( const AKSIMD_V2F32& in_vec )
00596 {
00597     AKSIMD_V2F32 vCompare;
00598     AKSIMD_GETELEMENT_V2F32(vCompare,0) = sqrtf( AKSIMD_GETELEMENT_V2F32(in_vec,0) );
00599     AKSIMD_GETELEMENT_V2F32(vCompare,1) = sqrtf( AKSIMD_GETELEMENT_V2F32(in_vec,1) );
00600     
00601     //AKSIMD_V4F32 res = vrecpeq_f32( vrsqrteq_f32( in_vec ) );
00602     
00603     return vCompare /*res*/;
00604 }
00605 
00606 //@}
00607 ////////////////////////////////////////////////////////////////////////
00608 
00609 
00610 ////////////////////////////////////////////////////////////////////////
00611 /// @name AKSIMD packing / unpacking
00612 //@{
00613 
00614 //
00615 // _mm_unpacklo_epi16
00616 // r0 := a0
00617 // r1 := b0
00618 // r2 := a1
00619 // r3 := b1
00620 // r4 := a2
00621 // r5 := b2
00622 // r6 := a3
00623 // r7 := b3
00624 AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKLO_VECTOR8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00625 {
00626     AKSIMD_V4I32 vector;
00627     AkInt16 *pVec1,*pVec2,*pDest;
00628     pVec1 = (AkInt16*)&in_vec1;
00629     pVec2 = (AkInt16*)&in_vec2;
00630     pDest = (AkInt16*)&vector;
00631     
00632     pDest[0] = pVec1[0];
00633     pDest[1] = pVec2[0];    
00634     pDest[2] = pVec1[1];    
00635     pDest[3] = pVec2[1];
00636     pDest[4] = pVec1[2];
00637     pDest[5] = pVec2[2];
00638     pDest[6] = pVec1[3];
00639     pDest[7] = pVec2[3];
00640     
00641     return vector;
00642 }
00643 
00644 // _mm_unpackhi_epi16
00645 AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKHI_VECTOR8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00646 {
00647     AKSIMD_V4I32 vector;
00648     AkInt16 *pVec1,*pVec2,*pDest;
00649     pVec1 = (AkInt16*)&in_vec1;
00650     pVec2 = (AkInt16*)&in_vec2;
00651     pDest = (AkInt16*)&vector;
00652     
00653     pDest[0] = pVec1[4];
00654     pDest[1] = pVec2[4];    
00655     pDest[2] = pVec1[5];    
00656     pDest[3] = pVec2[5];
00657     pDest[4] = pVec1[6];
00658     pDest[5] = pVec2[6];
00659     pDest[6] = pVec1[7];
00660     pDest[7] = pVec2[7];
00661     
00662     return vector;
00663 }
00664 
00665 // _mm_unpacklo_ps
00666 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00667 {
00668     AKSIMD_V4F32 vector;
00669     vector.m_data[0] = in_vec1.m_data[0];
00670     vector.m_data[1] = in_vec2.m_data[0];
00671     vector.m_data[2] = in_vec1.m_data[1];
00672     vector.m_data[3] = in_vec2.m_data[1];
00673     
00674     return vector;
00675 }
00676 
00677 // _mm_unpackhi_ps
00678 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00679 {
00680     AKSIMD_V4F32 vector;
00681     vector.m_data[0] = in_vec1.m_data[2];
00682     vector.m_data[1] = in_vec2.m_data[2];
00683     vector.m_data[2] = in_vec1.m_data[3];
00684     vector.m_data[3] = in_vec2.m_data[3];
00685     
00686     return vector;
00687 }
00688 
00689 // _mm_packs_epi32
00690 AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00691 {
00692     AKSIMD_V4I32 vector;
00693     AkInt16 *pDest = (AkInt16*)&vector;
00694     
00695     pDest[0] = (AkInt16)AkClamp(in_vec1.m_data[0], -32768, 32767);
00696     pDest[1] = (AkInt16)AkClamp(in_vec1.m_data[1], -32768, 32767);
00697     pDest[2] = (AkInt16)AkClamp(in_vec1.m_data[2], -32768, 32767);
00698     pDest[3] = (AkInt16)AkClamp(in_vec1.m_data[3], -32768, 32767);
00699     pDest[4] = (AkInt16)AkClamp(in_vec2.m_data[0], -32768, 32767);
00700     pDest[5] = (AkInt16)AkClamp(in_vec2.m_data[1], -32768, 32767);
00701     pDest[6] = (AkInt16)AkClamp(in_vec2.m_data[2], -32768, 32767);
00702     pDest[7] = (AkInt16)AkClamp(in_vec2.m_data[3], -32768, 32767);
00703     
00704     return vector;
00705 }
00706 
00707 //@}
00708 ////////////////////////////////////////////////////////////////////////
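To make the saturation behaviour of the pack concrete, a small illustrative sketch (input values are arbitrary): the four lanes of the first argument land in the lower four 16-bit slots, the second argument fills the upper four, and out-of-range values clamp to the AkInt16 range.

    AKSIMD_V4I32 vA = { { 40000, -40000, 100, -100 } };
    AKSIMD_V4I32 vB = { { 0, 1, 2, 3 } };
    AKSIMD_V4I32 vPacked = AKSIMD_PACKS_V4I32( vA, vB );
    // The eight packed 16-bit values are 32767, -32768, 100, -100, 0, 1, 2, 3.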
00709 
00710 
00711 //#define AKSIMD_GET_ITEM( vec, index ) vec[index]
00712 
00713 
00714 
00715 
00716 ////////////////////////////////////////////////////////////////////////
00717 /// @name AKSIMD shuffling
00718 //@{
00719 
00720 // See _MM_SHUFFLE
00721 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
00722     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
00723 
00724 // See _mm_shuffle_ps
00725 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
00726 //#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw )
00727 
00728  AkForceInline AKSIMD_V4F32 AKSIMD_SHUFFLE_V4F32( const AKSIMD_V4F32& xyzw, const AKSIMD_V4F32& abcd, int mask )
00729 {
00730     AKSIMD_V4F32 vector;
00731     vector.m_data[0] = xyzw.m_data[(mask) & 0x3];
00732     vector.m_data[1] = xyzw.m_data[(mask >> 2) & 0x3];
00733     vector.m_data[2] = abcd.m_data[(mask >> 4) & 0x3];
00734     vector.m_data[3] = abcd.m_data[(mask >> 6) & 0x3];
00735     
00736     return vector;
00737 }
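A brief sketch of the mask semantics, with arbitrary values: the two low index pairs of the mask select lanes from the first argument and the two high pairs select lanes from the second, mirroring _mm_shuffle_ps.

    AKSIMD_V4F32 vA = { { 0.f, 1.f, 2.f, 3.f } };
    AKSIMD_V4F32 vB = { { 10.f, 11.f, 12.f, 13.f } };
    AKSIMD_V4F32 vR = AKSIMD_SHUFFLE_V4F32( vA, vB, AKSIMD_SHUFFLE(3, 2, 1, 0) );
    // vR == { 0, 1, 12, 13 }: lanes 0-1 come from vA (indices 0, 1), lanes 2-3 from vB (indices 2, 3).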
00738 
00739 
00740 /// Moves the upper two single-precision, floating-point values of b to
00741 /// the lower two single-precision, floating-point values of the result.
00742 /// The upper two single-precision, floating-point values of a are passed
00743 /// through to the result.
00744 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
00745 #define AKSIMD_MOVEHL_V4F32( a, b ) \
00746     AKSIMD_SHUFFLE_V4F32( (b), (a), AKSIMD_SHUFFLE(3, 2, 3, 2) )
00747 
00748 /// Moves the lower two single-precision, floating-point values of b to
00749 /// the upper two single-precision, floating-point values of the result.
00750 /// The lower two single-precision, floating-point values of a are passed
00751 /// through to the result.
00752 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
00753 #define AKSIMD_MOVELH_V4F32( a, b ) \
00754     AKSIMD_SHUFFLE_V4F32( (a), (b), AKSIMD_SHUFFLE(1, 0, 1, 0) )
00755 
00756 /// Swap the two lower floats with each other and the two higher floats with each other.
00757 #define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
00758 
00759 /// Swap the two lower floats with the two higher floats.
00760 #define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
00761 
00762 /// Barrel-shift all floats by one.
00763 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
00764 
00765  /// Duplicates the odd items into the even items (d c b a -> d d b b )
00766 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
00767 
00768  /// Duplicates the even items into the odd items (d c b a -> c c a a )
00769 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
00770 
00771 
00772 //#include <AK/SoundEngine/Platforms/Generic/AkSimdShuffle.h>
00773 
00774 //@}
00775 ////////////////////////////////////////////////////////////////////////
00776 
00777 // Old AKSIMD -- will search-and-replace later
00778 #define AkReal32Vector AKSIMD_V4F32
00779 #define AKSIMD_LOAD1( __scalar__ ) AKSIMD_LOAD1_V4F32( &__scalar__ )
00780 #define AKSIMD_LOADVEC(v) AKSIMD_LOAD_V4F32((const AKSIMD_F32*)((v)))
00781 #define AKSIMD_MUL AKSIMD_MUL_V4F32
00782 #define AKSIMD_STOREVEC AKSIMD_STORE_V4F32
00783 
00784 /// Faked in-place vector horizontal add. 
00785 /// \akwarning
00786 /// Don't expect this to be very efficient.
00787 /// \endakwarning
00788 static AkForceInline void AKSIMD_HORIZONTALADD( AKSIMD_V4F32 & vVec )
00789 {   
00790     AKSIMD_V4F32 vHighLow = AKSIMD_MOVEHL_V4F32(vVec, vVec);
00791     vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
00792     vHighLow = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0x55);
00793     vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
00794 } 
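A minimal usage sketch (values are illustrative): after the call, lane 0 holds the sum of the four original lanes; the remaining lanes contain partial sums and should be ignored.

    AKSIMD_V4F32 vSum = { { 1.f, 2.f, 3.f, 4.f } };
    AKSIMD_HORIZONTALADD( vSum );
    AkReal32 fTotal = AKSIMD_GETELEMENT_V4F32( vSum, 0 ); // 10.f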
00795 
00796 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
00797 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00798 {
00799     static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f }; 
00800 
00801     AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0)); 
00802     vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
00803     AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1)); 
00804     vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
00805     vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
00806     vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 ); 
00807     vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
00808     return vTmp2;
00809 }
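To see the interleaved layout, a small worked example (numbers are arbitrary): each pair of lanes holds one complex number as { re, im }, so (1 + 2i) * (3 + 4i) = -5 + 10i comes out as { -5, 10 } in each pair.

    AKSIMD_V4F32 vC1 = { { 1.f, 2.f, 1.f, 2.f } };       // { re0, im0, re1, im1 }
    AKSIMD_V4F32 vC2 = { { 3.f, 4.f, 3.f, 4.f } };
    AKSIMD_V4F32 vProd = AKSIMD_COMPLEXMUL( vC1, vC2 );  // { -5, 10, -5, 10 }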
00810 
00811 #define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
00812 
00813 #define AK_SIGN_BIT( val ) (((AkUInt32)val) >> 31)
00814 
00815 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec )
00816 {
00817     return AK_SIGN_BIT(in_vec.m_data[0]) | AK_SIGN_BIT(in_vec.m_data[1]) << 1 | AK_SIGN_BIT(in_vec.m_data[2]) << 2 |  AK_SIGN_BIT(in_vec.m_data[3]) << 3;
00818 }
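A cautious usage sketch (variable names are hypothetical): AKSIMD_MASK_V4F32 gathers one bit per lane, but note that on this generic path AK_SIGN_BIT converts the float value to AkUInt32 rather than reinterpreting its bit pattern, so the result is not bit-exact with _mm_movemask_ps.

    AKSIMD_V4F32 vLevels = AKSIMD_SET_V4F32( 1.f );
    int iSignMask = AKSIMD_MASK_V4F32( vLevels ); // 0: no lane is negative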
00819 
00820 #endif //_AKSIMD_GENERIC_H_
00821