Wwise SDK 2021.1.14
AkSimd.h
/*******************************************************************************
The content of this file includes portions of the AUDIOKINETIC Wwise Technology
released in source code form as part of the SDK installer package.

Commercial License Usage

Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
may use this file in accordance with the end user license agreement provided
with the software or, alternatively, in accordance with the terms contained in a
written agreement between you and Audiokinetic Inc.

Apache License Usage

Alternatively, this file may be used under the Apache License, Version 2.0 (the
"Apache License"); you may not use this file except in compliance with the
Apache License. You may obtain a copy of the Apache License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing, software distributed
under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
the specific language governing permissions and limitations under the License.

  Version: v2021.1.14  Build: 6590
  Copyright (c) 2006-2023 Audiokinetic Inc.
*******************************************************************************/

// AkSimd.h

/// \file
/// AKSIMD - SSE implementation

#ifndef _AK_SIMD_SSE_H_
#define _AK_SIMD_SSE_H_

#include <AK/SoundEngine/Common/AkTypes.h>
#include <xmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>
#if defined(__FMA__) || defined(__AVX2__)
#include <immintrin.h>
#endif

////////////////////////////////////////////////////////////////////////
/// @name Platform specific defines for prefetching
//@{

#define AKSIMD_ARCHCACHELINESIZE (64)	///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE (512)	///< Maximum amount of prefetching that is desirable (assuming an 8-way cache)
/// Cross-platform memory prefetch of effective address assuming non-temporal data
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )

//@}
////////////////////////////////////////////////////////////////////////
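
// Illustrative sketch (not part of the SDK header): prefetching one cache line
// ahead of a streaming read loop. pBuf and uNumFloats are hypothetical names.
//
//   for ( AkUInt32 i = 0; i + 4 <= uNumFloats; i += 4 )
//   {
//       AKSIMD_PREFETCHMEMORY( AKSIMD_ARCHCACHELINESIZE, pBuf + i );
//       AKSIMD_V4F32 v = AKSIMD_LOAD_V4F32( pBuf + i );
//       // ... process v ...
//   }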

////////////////////////////////////////////////////////////////////////
/// @name Platform specific memory size alignment for allocation purposes
//@{
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
//@}
////////////////////////////////////////////////////////////////////////
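
// For instance, AKSIMD_ALIGNSIZE rounds a byte count up to the next multiple of 16:
//   AKSIMD_ALIGNSIZE( 20 ) == 32, AKSIMD_ALIGNSIZE( 16 ) == 16, AKSIMD_ALIGNSIZE( 1 ) == 16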

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD types
//@{

typedef float AKSIMD_F32;	///< 32-bit float
typedef __m128 AKSIMD_V4F32;	///< Vector of 4 32-bit floats
typedef AKSIMD_V4F32 AKSIMD_V4COND;	///< Vector of 4 comparison results
typedef AKSIMD_V4F32 AKSIMD_V4FCOND;	///< Vector of 4 comparison results

typedef __m128i AKSIMD_V4I32;	///< Vector of 4 32-bit signed integers

struct AKSIMD_V4I32X2 {	///< Pair of 4 32-bit signed integers
	AKSIMD_V4I32 val[2];
};

struct AKSIMD_V4I32X4 {	///< Quartet of 4 32-bit signed integers
	AKSIMD_V4I32 val[4];
};

typedef AKSIMD_V4I32 AKSIMD_V4ICOND;	///< Vector of 4 comparison results

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD loading / setting
//@{

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_loadu_ps( (AkReal32*)(__addr__) )

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )

/// Loads a single single-precision, floating-point value, copying it into
/// all four words (see _mm_load1_ps, _mm_load_ps1)
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )

/// Sets the four single-precision, floating-point values to in_value (see
/// _mm_set1_ps, _mm_set_ps1)
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )

/// Populates the vector with the two double-precision, floating-point values
/// provided, reinterpreted as an AKSIMD_V4F32 (see _mm_set_pd)
#define AKSIMD_SETV_V2F64( _b, _a ) _mm_castpd_ps(_mm_set_pd( (_b), (_a) ))

/// Populates the full vector with the 4 floating point elements provided
#define AKSIMD_SETV_V4F32( _d, _c, _b, _a ) _mm_set_ps( (_d), (_c), (_b), (_a) )

/// Populates the full vector with the mask[3:0], setting each to 0 or ~0
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
{
	__m128i temp = _mm_set_epi32(8, 4, 2, 1);
	__m128i xvec = _mm_set1_epi32(x);
	__m128i xand = _mm_and_si128(xvec, temp);
	return _mm_castsi128_ps(_mm_cmpeq_epi32(temp, xand));
}
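
// Illustrative: bit i of x selects element i, so AKSIMD_SETMASK_V4COND( 0x5 )
// yields a mask with elements 0 and 2 set to ~0 and elements 1 and 3 set to 0.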

/// Sets the four single-precision, floating-point values to zero (see
/// _mm_setzero_ps)
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()

/// Loads a single-precision, floating-point value into the low word
/// and clears the upper three words.
/// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD storing
//@{

/// Stores four single-precision, floating-point values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_ps).
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores four single-precision, floating-point values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_ps).
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lower single-precision, floating-point value.
/// *p := a0 (see _mm_store_ss)
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shuffling
//@{

// Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )

/// Selects four specific single-precision, floating-point values from
/// a and b, based on the mask i (see _mm_shuffle_ps)
// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )

#define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), i ))

/// Moves the upper two single-precision, floating-point values of b to
/// the lower two single-precision, floating-point values of the result.
/// The upper two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )

/// Moves the lower two single-precision, floating-point values of b to
/// the upper two single-precision, floating-point values of the result.
/// The lower two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )

/// Swap the 2 lower floats together and the 2 higher floats together.
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))

/// Swap the 2 lower floats with the 2 higher floats.
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))

/// Barrel-shift all floats by one.
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))

/// Duplicates the odd items into the even items (d c b a -> d d b b )
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even items into the odd items (d c b a -> c c a a )
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
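
// Illustrative summary of the self-shuffles above, for v = ( d, c, b, a )
// listed from element 3 down to element 0:
//   AKSIMD_SHUFFLE_BADC( v ) -> ( c, d, a, b )
//   AKSIMD_SHUFFLE_CDAB( v ) -> ( b, a, d, c )
//   AKSIMD_SHUFFLE_BCDA( v ) -> ( a, d, c, b )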
//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD arithmetic
//@{

/// Subtracts the four single-precision, floating-point values of
/// a and b (a - b) (see _mm_sub_ps)
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )

/// Subtracts the lower single-precision, floating-point values of a and b.
/// The upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )

/// Adds the four single-precision, floating-point values of
/// a and b (see _mm_add_ps)
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )

/// Adds the lower single-precision, floating-point values of a and b; the
/// upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )

/// Multiplies the four single-precision, floating-point values
/// of a and b (see _mm_mul_ps)
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )

#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )

/// Multiplies the lower single-precision, floating-point values of
/// a and b; the upper three single-precision, floating-point values
/// are passed through from a.
/// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )

/// Vector multiply-add operation. (If we're targeting a platform or arch with FMA (AVX2 implies FMA), using the FMA intrinsics directly tends to be slightly more desirable.)
#if defined(__FMA__) || defined(__AVX2__)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_fmadd_ps( (__a__), (__b__) , (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_fmsub_ps( (__a__), (__b__) , (__c__) )
#else
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#endif
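
// Illustrative sketch: a gain-and-accumulate step built on the macros above,
// where pIn, pOut and vGain are hypothetical names.
//
//   AKSIMD_V4F32 vIn  = AKSIMD_LOAD_V4F32( pIn );
//   AKSIMD_V4F32 vAcc = AKSIMD_LOAD_V4F32( pOut );
//   vAcc = AKSIMD_MADD_V4F32( vIn, vGain, vAcc );	// vIn * vGain + vAcc
//   AKSIMD_STORE_V4F32( pOut, vAcc );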

/// Vector multiply-add operation.
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

/// Computes the minima of the four single-precision, floating-point
/// values of a and b (see _mm_min_ps)
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )

/// Computes the maximums of the four single-precision, floating-point
/// values of a and b (see _mm_max_ps)
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )

/// Computes the absolute value
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)

/// Changes the sign
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)

/// Vector square root approximation (see _mm_sqrt_ps)
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
#define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )

/// Reciprocal of x (1/x)
#define AKSIMD_RECIP_V4F32(__a__) _mm_rcp_ps(__a__)

/// Binary xor for single-precision floating-point
#define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps(a,b)

/// Rounds to upper value
static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32( const AKSIMD_V4F32 & x )
{
	static const AKSIMD_V4F32 vEpsilon = { 0.49999f, 0.49999f, 0.49999f, 0.49999f };
	return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, vEpsilon)));
}
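
// Illustrative: AKSIMD_CEIL_V4F32 rounds 1.25f up to 2.f, while exact integers
// such as 2.f are preserved because the added epsilon is slightly below one half.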

/// Faked in-place vector horizontal add - each element will represent sum of all elements
/// \akwarning
/// Don't expect this to be very efficient.
/// \endakwarning
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
{
	__m128 vAb = _mm_shuffle_ps(vVec, vVec, 0xB1);
	__m128 vHaddAb = _mm_add_ps(vVec, vAb);
	__m128 vHaddCd = _mm_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
	__m128 vHaddAbcd = _mm_add_ps(vHaddAb, vHaddCd);
	return vHaddAbcd;
}
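
// Illustrative: for v = ( 4.f, 3.f, 2.f, 1.f ), every element of
// AKSIMD_HORIZONTALADD_V4F32( v ) is 10.f.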

static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
{
	AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
	return AKSIMD_HORIZONTALADD_V4F32( vfDotProduct );
}

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
	static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };

	AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
	vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
	AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
	vTmp2 = AKSIMD_XOR_V4F32( vTmp2, vSign );
	vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
	return vTmp2;
}
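
// Illustrative check, with each complex number stored as an interleaved
// ( real, imag ) pair: (1 + 2i) * (3 + 4i) == -5 + 10i, so
//   AKSIMD_V4F32 v1 = AKSIMD_SETV_V4F32( 0.f, 0.f, 2.f, 1.f );	// low pair: re=1, im=2
//   AKSIMD_V4F32 v2 = AKSIMD_SETV_V4F32( 0.f, 0.f, 4.f, 3.f );	// low pair: re=3, im=4
//   AKSIMD_COMPLEXMUL_V4F32( v1, v2 );	// low pair: re=-5, im=10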

#ifdef AK_SSE3

#include <pmmintrin.h>

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
	AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1);	// multiplier real (a1, a1, a0, a0)
	vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2);	// temp1 (a1d1, a1c1, a0d0, a0c0)
	AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1);	// shuf multiplicand (c1, d1, c0, d0)
	AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1);	// multiplier imag (b1, b1, b0, b0)
	xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1);	// temp2 (b1c1, b1d1, b0c0, b0d0)
	AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2);	// (a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0)
	return vCOut;
}

#endif

#if __SSE3__

// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
// to/from packed elements in b, and store the results in dst.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_addsub_ps( a, b)

#else

// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
// to/from packed elements in b, and store the results in dst.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_add_ps( a, _mm_xor_ps(b, AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f)))

#endif

#if defined _MSC_VER && ( _MSC_VER <= 1600 )
	#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#else
	#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#endif

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD integer arithmetic
//@{

/// Adds the four integer values of a and b
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )

#define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b)
#define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b)
#define AKSIMD_OR_V4I32( a, b ) _mm_or_si128(a,b)
#define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b)
#define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b)
#define AKSIMD_NOT_V4I32( a ) _mm_xor_si128(a,_mm_set1_epi32(~0))

#define AKSIMD_OR_V4F32( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4F32( a, b ) _mm_and_ps(a,b)
#define AKSIMD_ANDNOT_V4F32( a, b ) _mm_andnot_ps(a,b)
#define AKSIMD_NOT_V4F32( a ) _mm_xor_ps(a,_mm_castsi128_ps(_mm_set1_epi32(~0)))

#define AKSIMD_OR_V4COND( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4COND( a, b ) _mm_and_ps(a,b)

/// Multiplies the low 16bits of a by b and stores it in V4I32 (no overflow)
#define AKSIMD_MULLO16_V4I32( a , b) _mm_mullo_epi16(a, b)

/// Multiplies the low 32bits of a by b and stores it in V4I32 (no overflow)
static AkForceInline AKSIMD_V4I32 AKSIMD_MULLO_V4I32( const AKSIMD_V4I32 vIn1, const AKSIMD_V4I32 vIn2 )
{
#ifdef __SSE4_1__ // use SSE 4.1 version directly where possible
	return _mm_mullo_epi32(vIn1, vIn2);
#else // use SSE 2 otherwise
	__m128i tmp1 = _mm_mul_epu32(vIn1, vIn2); // mul 2,0
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(vIn1, 4), _mm_srli_si128(vIn2, 4)); // mul 3,1
	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); // shuffle results to [63..0] and pack
#endif
}
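
// Illustrative: AKSIMD_MULLO_V4I32( AKSIMD_SET_V4I32( 3 ), AKSIMD_SET_V4I32( -7 ) )
// yields ( -21, -21, -21, -21 ); the low 32 bits of each product are correct for
// signed inputs on both the SSE4.1 and SSE2 paths.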

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD packing / unpacking
//@{

/// Selects and interleaves the lower two single-precision, floating-point
/// values from a and b (see _mm_unpacklo_ps)
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )

/// Selects and interleaves the upper two single-precision, floating-point
/// values from a and b (see _mm_unpackhi_ps)
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )

// Given four pointers, gathers 32-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (b a)
// e.g. (*addr[1]) := (d c)
// e.g. (*addr[2]) := (f e)
// e.g. (*addr[3]) := (h g)
// return struct has
// val[0] := (g e c a)
// val[1] := (h f d b)
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
{
	__m128i data[4] = {
		_mm_set1_epi32(*(AkInt32*)addr0),
		_mm_set1_epi32(*(AkInt32*)addr1),
		_mm_set1_epi32(*(AkInt32*)addr2),
		_mm_set1_epi32(*(AkInt32*)addr3),
	};

	__m128i group[2] = {
		_mm_unpacklo_epi32(data[0], data[1]),
		_mm_unpacklo_epi32(data[2], data[3]),
	};

	__m128i shuffle = _mm_unpacklo_epi64(group[0], group[1]);

	AKSIMD_V4I32X2 ret{
		_mm_srai_epi32(_mm_slli_epi32(shuffle, 16), 16),
		_mm_srai_epi32(shuffle, 16)
	};
	return ret;
}

// Given four pointers, gathers 64-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (d c b a)
// e.g. (*addr[1]) := (h g f e)
// e.g. (*addr[2]) := (l k j i)
// e.g. (*addr[3]) := (p o n m)
// return struct has
// val[0] := (m i e a)
// val[1] := (n j f b)
// val[2] := (o k g c)
// val[3] := (p l h d)

static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
{
	__m128i data[4] = {
		_mm_set1_epi64x(*(AkInt64*)addr0),
		_mm_set1_epi64x(*(AkInt64*)addr1),
		_mm_set1_epi64x(*(AkInt64*)addr2),
		_mm_set1_epi64x(*(AkInt64*)addr3),
	};

	__m128i group[2] = {
		_mm_unpacklo_epi64(data[0], data[1]),
		_mm_unpacklo_epi64(data[2], data[3]),
	};

	__m128i shuffle[2] = {
		_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
		_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
	};

	AKSIMD_V4I32X4 ret{
		_mm_srai_epi32(_mm_slli_epi32(shuffle[0],16),16),
		_mm_srai_epi32(shuffle[0],16),
		_mm_srai_epi32(_mm_slli_epi32(shuffle[1],16),16),
		_mm_srai_epi32(shuffle[1],16),
	};
	return ret;
}
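
// Illustrative sketch: deinterleaving four 16-bit sample streams, where the
// pStream pointers are hypothetical names.
//
//   AKSIMD_V4I32X2 samples = AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(
//       pStream3, pStream2, pStream1, pStream0 );
//   // samples.val[0] holds the first sign-extended 16-bit value of each stream,
//   // samples.val[1] holds the second.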

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD vector comparison
/// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
//@{

#define AKSIMD_CMP_CTRLMASK __m128

/// Vector "<=" operation (see _mm_cmple_ps)
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )

#define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )

/// Vector ">=" operation (see _mm_cmpge_ps)
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )

#define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )

/// Vector "==" operation (see _mm_cmpeq_ps)
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )

/// Return a when the control mask is 0, return b when the control mask is non-zero; the control mask is in vMask and is usually provided by the comparison operations above
static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
{
	vB = _mm_and_ps( vB, vMask );
	vA = _mm_andnot_ps( vMask, vA );
	return _mm_or_ps( vA, vB );
}

// (cond1 >= cond2) ? b : a.
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )

// a >= 0 ? b : c, following the usual C++ ternary operand order.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )
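
// Illustrative: clamping to an upper bound with the helpers above, where vIn
// and vMax are hypothetical names.
//
//   AKSIMD_V4F32 vClamped = AKSIMD_VSEL_V4F32( vIn, vMax, AKSIMD_GTEQ_V4F32( vIn, vMax ) );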

#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))

#define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ )

// returns true if every element of the provided vector is zero
static AkForceInline bool AKSIMD_TESTZERO_V4I32( AKSIMD_V4I32 a )
{
	return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_setzero_si128())) == 0xFFFF;
}
#define AKSIMD_TESTZERO_V4F32( __a__ ) AKSIMD_TESTZERO_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__ ) AKSIMD_TESTZERO_V4F32(__a__)

// returns true if every element of the provided vector is ones
static AkForceInline bool AKSIMD_TESTONES_V4I32( AKSIMD_V4I32 a )
{
	return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_set1_epi32(~0))) == 0xFFFF;
}
#define AKSIMD_TESTONES_V4F32( __a__ ) AKSIMD_TESTONES_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTONES_V4COND( __a__ ) AKSIMD_TESTONES_V4F32(__a__)

//@}
////////////////////////////////////////////////////////////////////////

/// Loads unaligned 128-bit value (see _mm_loadu_si128)
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Loads 128-bit value; despite the name, the address does not need to be aligned (see _mm_loadu_si128)
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()

#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )

#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )

#define AKSIMD_SETV_V2I64( _b, _a ) _mm_set_epi64x( (_b), (_a) )

/// Sets the 32b integer i at the location specified by index in a
#define AKSIMD_INSERT_V4I32( a, i, index) _mm_insert_epi32(a, i, index)

/// Sets the 64b integer i at the location specified by index in a
#define AKSIMD_INSERT_V2I64( a, i, index) _mm_insert_epi64(a, i, index)

/// Stores four 32-bit integer values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_si128).
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )

/// Stores four 32-bit integer values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_si128).
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD conversion
//@{

/// Converts the four signed 32-bit integer values of a to single-precision,
/// floating-point values (see _mm_cvtepi32_ps)
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by rounding (see _mm_cvtps_epi32)
#define AKSIMD_ROUND_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by truncating (see _mm_cvttps_epi32)
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )

/// Computes the bitwise AND of the 128-bit value in a and the
/// 128-bit value in b (see _mm_and_si128)
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )

/// Compares the 8 signed 16-bit integers in a and the 8 signed
/// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )

/// Converts the 4 half-precision floats in the lower 64-bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))

/// Converts the 4 half-precision floats in the upper 64-bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))

static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( AKSIMD_V4I32 vec )
{
	__m128i expMantData = _mm_and_si128(vec, _mm_set1_epi32(0x7fff0000));
	__m128i expMantShifted = _mm_srli_epi32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds

	// magic number to scale the fp16 exp range into the fp32 exp range (also renormalizes any denorms)
	__m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));

	// if the fp16 val was inf or nan, preserve the inf/nan exponent field (we can just 'or' the new inf-bits into the attempt at scaling from inf previously)
	__m128i infnanCheck = _mm_cmpgt_epi32(expMantData, _mm_set1_epi32(0x7bffffff));
	__m128i infnanExp = _mm_and_si128(infnanCheck, _mm_set1_epi32(255 << 23));
	__m128i expMantWithInfNan = _mm_or_si128(expMantFloat, infnanExp);

	// reincorporate the sign
	__m128i signData = _mm_and_si128(vec, _mm_set1_epi32(0x80000000));
	__m128 assembledFloat = _mm_castsi128_ps(_mm_or_si128(signData, expMantWithInfNan));
	return assembledFloat;
}

/// Converts the 4 full-precision floats vector to 4 half-precision floats
/// occupying the lower bits and leaving the upper bits as zero
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16( AKSIMD_V4F32 vec )
{
	__m128i signData = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x80000000));
	__m128i unsignedVec = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x7fffffff));

	// do the processing for values that will be denormed in float16
	// Add 0.5 to get value within range, and round; then move mantissa data up
	__m128 denormedVec = _mm_add_ps(_mm_castsi128_ps(unsignedVec), _mm_set1_ps(0.5f));
	__m128i denormResult = _mm_slli_epi32(_mm_castps_si128(denormedVec), 16);

	// processing for values that will be normal in float16
	__m128i subnormMagic = _mm_set1_epi32(0xC8000FFF); // -131072 + rounding bias
	__m128i normRoundPart1 = _mm_add_epi32(unsignedVec, subnormMagic);
	__m128i mantLsb = _mm_slli_epi32(unsignedVec, 31 - 13);
	__m128i mantSignExtendLsb = _mm_srai_epi32(mantLsb, 31); // Extend Lsb so that it's -1 when set
	__m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
	__m128i normResult = _mm_slli_epi32(normRoundPart2, 3);

	// Combine the norm and subnorm paths together
	__m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23); // smallest float32 that yields a normalized float16
	__m128i denormMask = _mm_cmpgt_epi32(normalMinimum, unsignedVec);

	__m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));

	// apply inf/nan check
	__m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000)); // test if the value is below the smallest float32 that exceeds the float16 range
	__m128i mantissaData = _mm_and_si128(unsignedVec, _mm_set1_epi32(0x007fffff));
	__m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000)); // mark the parts of the vector where we have a mantissa (i.e. NAN) as 0xffffffff
	__m128i nanMantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000)); // set the NaN mantissa bit if mantissa suggests this is NaN
	__m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
	__m128i infNanFloat = _mm_or_si128(infData, nanMantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit

	__m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));

	// reincorporate the original sign
	__m128i signedResult = _mm_or_si128(signData, resultWithInfNan);

	// store results packed in lower 64 bits, and set upper 64 to zero
	__m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD); // move 16b ints (x,x,x,x,d,c,b,a) down to (x,x,x,x,x,x,d,b)
	__m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD); // move 16b ints (h,g,f,e,x,x,x,x) down to (x,x,h,f,x,x,x,x)
	__m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4); // combine - (x, x, h, f, x, x, d, b)
	__m128i result = _mm_castps_si128(_mm_shuffle_ps(resultEpi16, _mm_setzero_ps(), 0x8)); // reshuffle with zero - (0,0,0,0,h,f,d,b)

	return result;
}
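
// Illustrative round trip through the half-precision helpers; 1.f through 4.f
// are exactly representable in float16, so the values survive unchanged.
//
//   AKSIMD_V4F32 v     = AKSIMD_SETV_V4F32( 4.f, 3.f, 2.f, 1.f );
//   AKSIMD_V4I32 vF16  = AKSIMD_CONVERT_V4F32_TO_V4F16( v );	// packed in the low 64 bits
//   AKSIMD_V4F32 vBack = AKSIMD_CONVERT_V4F16_TO_V4F32_LO( vF16 );	// == v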

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD cast
//@{

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) _mm_castpd_ps(__vec__)

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) _mm_castpd_si128(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) _mm_castps_pd(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) _mm_castsi128_pd(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) _mm_castsi128_ps(__vec__)

/// Cast vector of type AKSIMD_V4COND to AKSIMD_V4F32.
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V4F32 to AKSIMD_V4COND.
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) (__vec__)

//@}
////////////////////////////////////////////////////////////////////////

/// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
/// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )

/// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
/// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )

/// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
/// integers and saturates (see _mm_packs_epi32)
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shifting
//@{

/// Shifts the 4 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	_mm_slli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed or unsigned 32-bit integers in a right by
/// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
	_mm_srli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_epi32)
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	_mm_srai_epi32( (__vec__), (__shiftBy__) )

//@}
////////////////////////////////////////////////////////////////////////

#if defined( AK_CPU_X86 ) /// MMX

typedef __m64 AKSIMD_V2F32;	///< Vector of 2 32-bit floats

#define AKSIMD_SETZERO_V2F32() _mm_setzero_si64();

#define AKSIMD_CMPGT_V2I32( a, b ) _mm_cmpgt_pi16(a,b)

/// Interleaves the lower 2 signed or unsigned 16-bit integers in a with
/// the lower 2 signed or unsigned 16-bit integers in b (see _mm_unpacklo_pi16)
#define AKSIMD_UNPACKLO_VECTOR4I16( a, b ) _mm_unpacklo_pi16( a, b )

/// Interleaves the upper 2 signed or unsigned 16-bit integers in a with
/// the upper 2 signed or unsigned 16-bit integers in b (see _mm_unpackhi_pi16)
#define AKSIMD_UNPACKHI_VECTOR4I16( a, b ) _mm_unpackhi_pi16( a, b )

/// Shifts the 2 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_pi32)
#define AKSIMD_SHIFTLEFT_V2I32( __vec__, __shiftBy__ ) \
	_mm_slli_pi32( (__vec__), (__shiftBy__) )

/// Shifts the 2 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_pi32)
#define AKSIMD_SHIFTRIGHTARITH_V2I32( __vec__, __shiftBy__ ) \
	_mm_srai_pi32( (__vec__), (__shiftBy__) )

/// Used when ending a block of code that utilizes any MMX construct on x86 code
/// so that the x87 FPU can be used again
#define AKSIMD_MMX_EMPTY _mm_empty();

#endif


#endif //_AK_SIMD_SSE_H_