Table of Contents

Wwise SDK 2018.1.11
AkSimd.h
Go to the documentation of this file.
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Version: <VERSION> Build: <BUILDNUMBER>
25  Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
26 *******************************************************************************/
27 
28 // AkSimd.h
29 
30 /// \file
31 /// AKSIMD - SSE implementation
32 
33 #ifndef _AK_SIMD_SSE_H_
34 #define _AK_SIMD_SSE_H_
35 
37 #include <xmmintrin.h>
38 
39 ////////////////////////////////////////////////////////////////////////
40 /// @name Platform specific defines for prefetching
41 //@{
42 
43 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
44 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)
45 /// Cross-platform memory prefetch of effective address assuming non-temporal data
46 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )
47 
48 //@}
49 ////////////////////////////////////////////////////////////////////////
50 
51 ////////////////////////////////////////////////////////////////////////
52 /// @name Platform specific memory size alignment for allocation purposes
53 //@{
54 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
55 //@}
56 ////////////////////////////////////////////////////////////////////////
57 
58 ////////////////////////////////////////////////////////////////////////
59 /// @name AKSIMD types
60 //@{
61 
62 typedef float AKSIMD_F32; ///< 32-bit float
63 typedef __m128 AKSIMD_V4F32; ///< Vector of 4 32-bit floats
64 typedef AKSIMD_V4F32 AKSIMD_V4COND; ///< Vector of 4 comparison results
65 typedef AKSIMD_V4F32 AKSIMD_V4FCOND; ///< Vector of 4 comparison results
66 
67 //@}
68 ////////////////////////////////////////////////////////////////////////
69 
70 
71 ////////////////////////////////////////////////////////////////////////
72 /// @name AKSIMD loading / setting
73 //@{
74 
75 /// Loads four single-precision, floating-point values (see _mm_load_ps)
76 #define AKSIMD_LOAD_V4F32( __addr__ ) _mm_load_ps( (AkReal32*)(__addr__) )
77 
78 /// Loads four single-precision floating-point values from unaligned
79 /// memory (see _mm_loadu_ps)
80 #define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )
81 
82 /// Loads a single single-precision, floating-point value, copying it into
83 /// all four words (see _mm_load1_ps, _mm_load_ps1)
84 #define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )
85 
86 /// Sets the four single-precision, floating-point values to in_value (see
87 /// _mm_set1_ps, _mm_set_ps1)
88 #define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )
89 
90 /// Sets the four single-precision, floating-point values to zero (see
91 /// _mm_setzero_ps)
92 #define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()
93 
94 /// Loads a single-precision, floating-point value into the low word
95 /// and clears the upper three words.
96 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
97 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )
98 
99 //@}
100 ////////////////////////////////////////////////////////////////////////
101 
102 
103 ////////////////////////////////////////////////////////////////////////
104 /// @name AKSIMD storing
105 //@{
106 
107 /// Stores four single-precision, floating-point values. The address
108 /// must be 16-byte aligned (see _mm_store_ps)
109 #define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_store_ps( (AkReal32*)(__addr__), (__vec__) )
110 
111 /// Stores four single-precision, floating-point values. The address
112 /// does not need to be 16-byte aligned (see _mm_storeu_ps).
113 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
114 
115 /// Stores the lower single-precision, floating-point value.
116 /// *p := a0 (see _mm_store_ss)
117 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
118 
119 //@}
120 ////////////////////////////////////////////////////////////////////////
121 
122 ////////////////////////////////////////////////////////////////////////
123 /// @name AKSIMD shuffling
124 //@{
125 
126 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
127 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )
128 
129 /// Selects four specific single-precision, floating-point values from
130 /// a and b, based on the mask i (see _mm_shuffle_ps)
131 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
132 #define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )
133 
134 /// Moves the upper two single-precision, floating-point values of b to
135 /// the lower two single-precision, floating-point values of the result.
136 /// The upper two single-precision, floating-point values of a are passed
137 /// through to the result.
138 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
139 #define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )
140 
141 /// Moves the lower two single-precision, floating-point values of b to
142 /// the upper two single-precision, floating-point values of the result.
143 /// The lower two single-precision, floating-point values of a are passed
144 /// through to the result.
145 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
146 #define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )
147 
148 /// Swap the 2 lower floats together and the 2 higher floats together.
149 #define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))
150 
151 /// Swap the 2 lower floats with the 2 higher floats.
152 #define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))
153 
154 /// Barrel-shift all floats by one.
155 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))
156 
157 /// Duplicates the odd items into the even items (d c b a -> d d b b )
158 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
159 
160 /// Duplicates the even items into the odd items (d c b a -> c c a a )
161 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
162 
163 //@}
164 ////////////////////////////////////////////////////////////////////////
165 
166 
167 ////////////////////////////////////////////////////////////////////////
168 /// @name AKSIMD arithmetic
169 //@{
170 
171 /// Subtracts the four single-precision, floating-point values of
172 /// a and b (a - b) (see _mm_sub_ps)
173 #define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )
174 
175 /// Subtracts the lower single-precision, floating-point values of a and b.
176 /// The upper three single-precision, floating-point values are passed through from a.
177 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
178 #define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )
179 
180 /// Adds the four single-precision, floating-point values of
181 /// a and b (see _mm_add_ps)
182 #define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )
183 
184 /// Adds the lower single-precision, floating-point values of a and b; the
185 /// upper three single-precision, floating-point values are passed through from a.
186 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
187 #define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )
188 
189 /// Multiplies the four single-precision, floating-point values
190 /// of a and b (see _mm_mul_ps)
191 #define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )
192 
193 #define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )
194 
195 /// Multiplies the lower single-precision, floating-point values of
196 /// a and b; the upper three single-precision, floating-point values
197 /// are passed through from a.
198 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
199 #define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )
200 
201 /// Vector multiply-add operation.
202 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
203 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
204 
205 /// Vector multiply-add operation.
206 #define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )
207 
208 /// Computes the minima of the four single-precision, floating-point
209 /// values of a and b (see _mm_min_ps)
210 #define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )
211 
212 /// Computes the maximums of the four single-precision, floating-point
213 /// values of a and b (see _mm_max_ps)
214 #define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )
215 
216 /// Computes the absolute value by clearing the sign bit of each float (andnot with -0.f)
217 #define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
218 
219 /// Changes the sign by flipping the sign bit of each float (xor with -0.f)
220 #define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)
221 
222 /// Vector square root approximation (see _mm_sqrt_ps)
223 #define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
224 
225 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
226 #define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )
227 
228 /// Faked in-place vector horizontal add.
229 /// \akwarning
230 /// Don't expect this to be very efficient.
231 /// \endakwarning
233 {
234  __m128 vHighLow = _mm_movehl_ps(vVec, vVec);
235  vVec = _mm_add_ps(vVec, vHighLow);
236  vHighLow = _mm_shuffle_ps(vVec, vVec, 0x55);
237  vVec = _mm_add_ps(vVec, vHighLow);
238 }
239 
241 {
242  AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
243  AKSIMD_HORIZONTALADD( vfDotProduct );
244  return AKSIMD_SHUFFLE_V4F32( vfDotProduct, vfDotProduct, AKSIMD_SHUFFLE(0,0,0,0) );
245 }
246 
247 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
249 {
250  static const AKSIMD_V4F32 vSign = { -1.f, 1.f, -1.f, 1.f };
251 
252  AKSIMD_V4F32 vTmp1 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(2,2,0,0));
253  vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
254  AKSIMD_V4F32 vTmp2 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(3,3,1,1));
255  vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
256  vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
257  return vTmp2;
258 }
259 
260 #ifdef AK_SSE3
261 
262 #include <pmmintrin.h>
263 
264 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
265 static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
266 {
267  AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1); // multiplier real (a1, a1, a0, a0)
268  vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2); // temp1 (a1d1, a1c1, a0d0, a0c0)
269  AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1); // shuf multiplicand(c1, d1, c0, d0)
270  AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1); // multiplier imag (b1, b1, b0, b0)
271  xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1); // temp2 (b1c1, b1d1, b0c0, b0d0)
272  AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2); // b1c1+a1d1, a1c1-b1d1, a0d0+b0d0, a0c0-b0c0
273  return vCOut;
274 }
275 
276 #endif
277 
278 #if defined _MSC_VER && ( _MSC_VER <= 1600 )
279  #define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
280 #else
281  #define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
282 #endif
283 
284 //@}
285 ////////////////////////////////////////////////////////////////////////
286 
287 
288 ////////////////////////////////////////////////////////////////////////
289 /// @name AKSIMD integer arithmetic
290 //@{
291 
292 /// Adds the four integer values of a and b
293 #define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )
294 
295 #define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b)
296 #define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b)
297 #define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b)
298 #define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps(a,b)
299 #define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b)
300 
301 /// Multiplies the low 16bits of a by b and stores it in V4I32 (no overflow)
302 #define AKSIMD_MULLO16_V4I32( a , b) _mm_mullo_epi16(a, b)
303 //@}
304 ////////////////////////////////////////////////////////////////////////
305 
306 
307 ////////////////////////////////////////////////////////////////////////
308 /// @name AKSIMD packing / unpacking
309 //@{
310 
311 /// Selects and interleaves the lower two single-precision, floating-point
312 /// values from a and b (see _mm_unpacklo_ps)
313 #define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )
314 
315 /// Selects and interleaves the upper two single-precision, floating-point
316 /// values from a and b (see _mm_unpackhi_ps)
317 #define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )
318 
319 //@}
320 ////////////////////////////////////////////////////////////////////////
321 
322 ////////////////////////////////////////////////////////////////////////
323 /// @name AKSIMD vector comparison
324 /// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
325 //@{
326 
327 #define AKSIMD_CMP_CTRLMASK __m128
328 
329 /// Vector "<=" operation (see _mm_cmple_ps)
330 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )
331 
332 #define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )
333 
334 /// Vector ">=" operation (see _mm_cmpge_ps)
335 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )
336 
337 #define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )
338 
339 /// Vector "==" operation (see _mm_cmpeq_ps)
340 #define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )
341 
342 /// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
344 {
345  vB = _mm_and_ps( vB, vMask );
346  vA= _mm_andnot_ps( vMask, vA );
347  return _mm_or_ps( vA, vB );
348 }
349 
350 // (cond1 >= cond2) ? b : a.
351 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )
352 
353 // a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax.
354 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )
355 
356 #define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
357 
358 #define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ )
359 
360 //@}
361 ////////////////////////////////////////////////////////////////////////
362 
363 #include <emmintrin.h>
364 
365 typedef __m128i AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers
366 
368 
369 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
370 #define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
371 
372 /// Loads aligned 128-bit value (see _mm_load_si128)
373 #define AKSIMD_LOAD_V4I32( __addr__ ) _mm_load_si128( (__addr__) )
374 
375 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
376 #define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()
377 
378 #define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )
379 
380 #define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )
381 
382 /// Stores four 32-bit integer values.
383 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_store_si128( (__addr__), (__vec__) )
384 
385 /// Stores four 32-bit integer values. The address
386 /// does not need to be 16-byte aligned (see _mm_storeu_si128).
387 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__addr__), (__vec__) )
388 
389 ////////////////////////////////////////////////////////////////////////
390 /// @name AKSIMD conversion
391 //@{
392 
393 /// Converts the four signed 32-bit integer values of a to single-precision,
394 /// floating-point values (see _mm_cvtepi32_ps)
395 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )
396 
397 /// Converts the four single-precision, floating-point values of a to signed
398 /// 32-bit integer values by rounding (see _mm_cvtps_epi32)
399 #define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )
400 
401 /// Converts the four single-precision, floating-point values of a to signed
402 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
403 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )
404 
405 /// Computes the bitwise AND of the 128-bit value in a and the
406 /// 128-bit value in b (see _mm_and_si128)
407 #define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )
408 
409 /// Compares the 8 signed 16-bit integers in a and the 8 signed
410 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
411 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )
412 
413 //@}
414 ////////////////////////////////////////////////////////////////////////
415 
416 /// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
417 /// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
418 #define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )
419 
420 /// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
421 /// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
422 #define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )
423 
424 /// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
425 /// integers and saturates (see _mm_packs_epi32)
426 #define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )
427 
428 ////////////////////////////////////////////////////////////////////////
429 /// @name AKSIMD shifting
430 //@{
431 
432 /// Shifts the 4 signed or unsigned 32-bit integers in a left by
433 /// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
434 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
435  _mm_slli_epi32( (__vec__), (__shiftBy__) )
436 
437 /// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
438 /// bits while shifting in the sign bit (see _mm_srai_epi32)
439 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
440  _mm_srai_epi32( (__vec__), (__shiftBy__) )
441 
442 //@}
443 ////////////////////////////////////////////////////////////////////////
444 
445 #if defined( AK_CPU_X86 ) && !defined(AK_IOS) /// MMX
446 
447 typedef __m64 AKSIMD_V2F32; ///< Vector of 2 32-bit floats
448 
449 #endif
450 
451 
452 #endif //_AK_SIMD_SSE_H_
float32_t AKSIMD_F32
32-bit float
Definition: AkSimd.h:74
static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT(AKSIMD_V4F32 &vVec, const AKSIMD_V4F32 &vfSigns)
Definition: AkSimd.h:240
#define AKSIMD_MUL_V4F32(a, b)
Definition: AkSimd.h:191
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimd.h:78
uint32x4_t AKSIMD_V4ICOND
Vector of 4 comparison results.
Definition: AkSimd.h:79
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add operation.
Definition: AkSimd.h:202
static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32(AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask)
Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usu...
Definition: AkSimd.h:343
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimd.h:76
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL(const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2)
Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary par...
Definition: AkSimd.h:248
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:127
#define AkForceInline
Force inlining.
Definition: AkTypes.h:63
int32x4_t AKSIMD_V4I32
Vector of 4 32-bit signed integers.
Definition: AkSimd.h:68
#define AKSIMD_SHUFFLE_BADC(__a__)
Swap the 2 lower floats together and the 2 higher floats together.
Definition: AkSimd.h:149
float32x2_t AKSIMD_V2F32
Vector of 2 32-bit floats.
Definition: AkSimd.h:75
static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 &vVec)
Definition: AkSimd.h:232
#define AKSIMD_SHUFFLE_V4F32(a, b, i)
Definition: AkSimd.h:132
uint32x4_t AKSIMD_V4FCOND
Vector of 4 comparison results.
Definition: AkSimd.h:80