Wwise SDK 2022.1.12
AkSimd.h
/*******************************************************************************
The content of this file includes portions of the AUDIOKINETIC Wwise Technology
released in source code form as part of the SDK installer package.

Commercial License Usage

Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
may use this file in accordance with the end user license agreement provided
with the software or, alternatively, in accordance with the terms contained in a
written agreement between you and Audiokinetic Inc.

Apache License Usage

Alternatively, this file may be used under the Apache License, Version 2.0 (the
"Apache License"); you may not use this file except in compliance with the
Apache License. You may obtain a copy of the Apache License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing, software distributed
under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
the specific language governing permissions and limitations under the License.

  Copyright (c) 2024 Audiokinetic Inc.
*******************************************************************************/

// AkSimd.h

/// \file
/// AKSIMD - SSE implementation

#ifndef _AK_SIMD_SSE_H_
#define _AK_SIMD_SSE_H_

#include <xmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>
#if defined(__FMA__) || defined(__AVX2__)
#include <immintrin.h>
#endif

////////////////////////////////////////////////////////////////////////
/// @name Platform specific defines for prefetching
//@{

#define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control the maximum amount of prefetching desirable (assuming 8-way cache)
/// Cross-platform memory prefetch of effective address assuming non-temporal data
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )

//@}
////////////////////////////////////////////////////////////////////////

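// Usage sketch (illustrative; pBuffer and uNumFrames are assumed names, not
// part of the SDK): prefetch one cache line ahead while streaming a buffer.
//
//   for ( AkUInt32 i = 0; i < uNumFrames; i += 4 )
//   {
//       AKSIMD_PREFETCHMEMORY( AKSIMD_ARCHCACHELINESIZE, pBuffer + i );
//       AKSIMD_V4F32 v = AKSIMD_LOAD_V4F32( pBuffer + i );
//       AKSIMD_STORE_V4F32( pBuffer + i, AKSIMD_MUL_V4F32( v, v ) );
//   }
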
////////////////////////////////////////////////////////////////////////
/// @name Platform specific memory size alignment for allocation purposes
//@{
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
//@}
////////////////////////////////////////////////////////////////////////
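
// Worked example: AKSIMD_ALIGNSIZE rounds a byte count up to the next multiple
// of 16, e.g. AKSIMD_ALIGNSIZE( 17 ) == 32 and AKSIMD_ALIGNSIZE( 32 ) == 32.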

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD types
//@{

typedef float AKSIMD_F32; ///< 32-bit float
typedef __m128 AKSIMD_V4F32; ///< Vector of 4 32-bit floats
typedef AKSIMD_V4F32 AKSIMD_V4COND; ///< Vector of 4 comparison results
typedef AKSIMD_V4F32 AKSIMD_V4FCOND; ///< Vector of 4 comparison results

typedef __m128i AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers

struct AKSIMD_V4I32X2 { ///< Pair of 4 32-bit signed integers
    AKSIMD_V4I32 val[2];
};

struct AKSIMD_V4I32X4 { ///< Quartet of 4 32-bit signed integers
    AKSIMD_V4I32 val[4];
};

typedef AKSIMD_V4I32 AKSIMD_V4ICOND; ///< Vector of 4 comparison results

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD loading / setting
//@{

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_loadu_ps( (AkReal32*)(__addr__) )

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )

/// Loads a single single-precision, floating-point value, copying it into
/// all four words (see _mm_load1_ps, _mm_load_ps1)
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )

/// Sets the four single-precision, floating-point values to in_value (see
/// _mm_set1_ps, _mm_set_ps1)
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )

/// Sets the two double-precision, floating-point values to in_value
#define AKSIMD_SETV_V2F64( _b, _a ) _mm_castpd_ps(_mm_set_pd( (_b), (_a) ))

/// Populates the full vector with the 4 floating point elements provided
#define AKSIMD_SETV_V4F32( _d, _c, _b, _a ) _mm_set_ps( (_d), (_c), (_b), (_a) )

/// Populates the full vector with the mask[3:0], setting each to 0 or ~0
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
{
    __m128i temp = _mm_set_epi32(8, 4, 2, 1);
    __m128i xvec = _mm_set1_epi32(x);
    __m128i xand = _mm_and_si128(xvec, temp);
    return _mm_castsi128_ps(_mm_cmpeq_epi32(temp, xand));
}
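
// Usage sketch: AKSIMD_SETMASK_V4COND( 0x5 ) sets lanes 0 and 2 to ~0 and the
// others to 0, producing a mask suitable for AKSIMD_VSEL_V4F32 further below.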

/// Sets the four single-precision, floating-point values to zero (see
/// _mm_setzero_ps)
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()

/// Loads a single-precision, floating-point value into the low word
/// and clears the upper three words.
/// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD storing
//@{

/// Stores four single-precision, floating-point values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_ps).
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores four single-precision, floating-point values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_ps).
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lower single-precision, floating-point value.
/// *p := a0 (see _mm_store_ss)
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lower double-precision, floating-point value.
/// *p := a0 (see _mm_store_sd)
#define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) _mm_store_sd( (AkReal64*)(__addr__), _mm_castps_pd(__vec__) )

//@}
////////////////////////////////////////////////////////////////////////

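// Usage sketch (fGain, pIn and pOut are illustrative names): scaling four
// samples with a load/store round trip.
//
//   AKSIMD_V4F32 vGain = AKSIMD_SET_V4F32( fGain );
//   AKSIMD_V4F32 v = AKSIMD_LOAD_V4F32( pIn );
//   AKSIMD_STORE_V4F32( pOut, AKSIMD_MUL_V4F32( v, vGain ) );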

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shuffling
//@{

// Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )

/// Selects four specific single-precision, floating-point values from
/// a and b, based on the mask i (see _mm_shuffle_ps)
// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )

#define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), i ))

/// Moves the upper two single-precision, floating-point values of b to
/// the lower two single-precision, floating-point values of the result.
/// The upper two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )

/// Moves the lower two single-precision, floating-point values of b to
/// the upper two single-precision, floating-point values of the result.
/// The lower two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )

/// Swap the 2 lower floats together and the 2 higher floats together.
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))

/// Swap the 2 lower floats with the 2 higher floats.
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))

/// Barrel-shift all floats by one.
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))

/// Duplicates the odd items into the even items (d c b a -> d d b b )
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even items into the odd items (d c b a -> c c a a )
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
//@}
////////////////////////////////////////////////////////////////////////
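
// Worked example: with v = (d, c, b, a) from high lane to low,
// AKSIMD_SHUFFLE_V4F32( v, v, AKSIMD_SHUFFLE(0,1,2,3) ) yields (a, b, c, d):
// each index in AKSIMD_SHUFFLE picks the source lane for the corresponding
// destination lane, highest destination lane first.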


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD arithmetic
//@{

/// Subtracts the four single-precision, floating-point values of
/// a and b (a - b) (see _mm_sub_ps)
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )

/// Subtracts the lower single-precision, floating-point values of a and b.
/// The upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )

/// Adds the four single-precision, floating-point values of
/// a and b (see _mm_add_ps)
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )

/// Adds the lower single-precision, floating-point values of a and b; the
/// upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )

/// Multiplies the four single-precision, floating-point values
/// of a and b (see _mm_mul_ps)
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )

/// Divides the four single-precision, floating-point values
/// of a by b (see _mm_div_ps)
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )

/// Multiplies the lower single-precision, floating-point values of
/// a and b; the upper three single-precision, floating-point values
/// are passed through from a.
/// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )

/// Vector multiply-add operation. (When targeting a platform or architecture
/// with FMA support (AVX2 implies FMA), using the FMA intrinsics directly
/// tends to be slightly more desirable.)
#if defined(__FMA__) || defined(__AVX2__)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_fmadd_ps( (__a__), (__b__) , (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_fmsub_ps( (__a__), (__b__) , (__c__) )
#else
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#endif
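
// Usage sketch: out = gain * in + bias in one step; with __FMA__ or __AVX2__
// defined this compiles to a single fused multiply-add (names illustrative).
//
//   AKSIMD_V4F32 vOut = AKSIMD_MADD_V4F32( vGain, vIn, vBias );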

/// Vector multiply-add operation.
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

/// Computes the minima of the four single-precision, floating-point
/// values of a and b (see _mm_min_ps)
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )

/// Computes the maxima of the four single-precision, floating-point
/// values of a and b (see _mm_max_ps)
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )

/// Computes the absolute value
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)

/// Changes the sign
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)

/// Vector square root approximation (see _mm_sqrt_ps)
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
#define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )

/// Reciprocal of x (1/x)
#define AKSIMD_RECIP_V4F32(__a__) _mm_rcp_ps(__a__)

/// Binary xor for single-precision floating-point
#define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps(a,b)

/// Rounds to upper value
static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32( const AKSIMD_V4F32 & x )
{
    static const AKSIMD_V4F32 vEpsilon = { 0.49999f, 0.49999f, 0.49999f, 0.49999f };
    return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, vEpsilon)));
}
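
// Worked example: AKSIMD_CEIL_V4F32 approximates ceil() by adding just under
// 0.5 and rounding to nearest: 1.2 + 0.49999 rounds to 2, while 1.0 + 0.49999
// still rounds to 1, as expected of a ceiling function.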

/// Faked in-place vector horizontal add - each element will represent the sum of all elements
/// \akwarning
/// Don't expect this to be very efficient.
/// \endakwarning
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
{
    __m128 vAb = _mm_shuffle_ps(vVec, vVec, 0xB1);
    __m128 vHaddAb = _mm_add_ps(vVec, vAb);
    __m128 vHaddCd = _mm_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
    __m128 vHaddAbcd = _mm_add_ps(vHaddAb, vHaddCd);
    return vHaddAbcd;
}

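/// Dot product: multiplies vVec and vfSigns element-wise, then spreads the sum
/// of the four products into every lane via AKSIMD_HORIZONTALADD_V4F32.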
static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
{
    AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
    return AKSIMD_HORIZONTALADD_V4F32( vfDotProduct );
}
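
// Usage sketch: reducing four partial sums to a scalar (names illustrative):
//
//   AKSIMD_V4F32 vSum = AKSIMD_HORIZONTALADD_V4F32( vPartialSums );
//   AkReal32 fTotal;
//   AKSIMD_STORE1_V4F32( &fTotal, vSum ); // every lane holds the total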

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
    static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };

    AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
    vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
    AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
    vTmp2 = AKSIMD_XOR_V4F32( vTmp2, vSign );
    vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
    return vTmp2;
}

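// Worked example: if vCIn1 holds two complex numbers (a0 + i*b0, a1 + i*b1)
// and vCIn2 holds (c0 + i*d0, c1 + i*d1), each as interleaved (real, imag)
// lane pairs, the result holds the two complex products: real lanes contain
// a*c - b*d and imaginary lanes contain a*d + b*c.
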
#ifdef AK_SSE3

#include <pmmintrin.h>

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
    AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1); // multiplier real (a1, a1, a0, a0)
    vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2); // temp1 (a1d1, a1c1, a0d0, a0c0)
    AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1); // shuf multiplicand (c1, d1, c0, d0)
    AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1); // multiplier imag (b1, b1, b0, b0)
    xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1); // temp2 (b1c1, b1d1, b0c0, b0d0)
    AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2); // (b1c1+a1d1, a1c1-b1d1, b0c0+a0d0, a0c0-b0d0)
    return vCOut;
}

#endif

#if __SSE3__

// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
// to/from packed elements in b, and store the results in dst.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_addsub_ps( a, b)

#else

// Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
// to/from packed elements in b, and store the results in dst.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_add_ps( a, _mm_xor_ps(b, AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f)))

#endif
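
// Worked example: AKSIMD_ADDSUB_V4F32 subtracts in the even lanes and adds in
// the odd ones: with a = AKSIMD_SETV_V4F32(4.f, 3.f, 2.f, 1.f) and b set to
// all ones, the result is (5, 2, 3, 0) in the same (d, c, b, a) lane order.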

#if defined _MSC_VER && ( _MSC_VER <= 1600 )
    #define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#elif defined(AK_CPU_X86) || defined(AK_CPU_X86_64)
    #define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#else
    #define AKSIMD_ASSERTFLUSHZEROMODE
#endif

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD integer arithmetic
//@{

/// Adds the four integer values of a and b
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )

#define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b)
#define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b)
#define AKSIMD_OR_V4I32( a, b ) _mm_or_si128(a,b)
#define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b)
#define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b)
#define AKSIMD_NOT_V4I32( a ) _mm_xor_si128(a,_mm_set1_epi32(~0))

#define AKSIMD_OR_V4F32( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4F32( a, b ) _mm_and_ps(a,b)
#define AKSIMD_ANDNOT_V4F32( a, b ) _mm_andnot_ps(a,b)
#define AKSIMD_NOT_V4F32( a ) _mm_xor_ps(a,_mm_castsi128_ps(_mm_set1_epi32(~0)))

#define AKSIMD_OR_V4COND( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4COND( a, b ) _mm_and_ps(a,b)

/// Multiplies the low 16 bits of a by b and stores it in V4I32 (no overflow)
#define AKSIMD_MULLO16_V4I32( a , b) _mm_mullo_epi16(a, b)

/// Multiplies the low 32 bits of a by b and stores it in V4I32 (no overflow)
static AkForceInline AKSIMD_V4I32 AKSIMD_MULLO_V4I32( const AKSIMD_V4I32 vIn1, const AKSIMD_V4I32 vIn2 )
{
#ifdef __SSE4_1__ // use the SSE 4.1 version directly where possible
    return _mm_mullo_epi32(vIn1, vIn2);
#else // use SSE 2 otherwise
    __m128i tmp1 = _mm_mul_epu32(vIn1, vIn2); // mul 2,0
    __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(vIn1, 4), _mm_srli_si128(vIn2, 4)); // mul 3,1
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); // shuffle results to [63..0] and pack
#endif
}
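
// Usage sketch: element-wise 32-bit multiply (values illustrative):
//
//   AKSIMD_V4I32 v1 = AKSIMD_SETV_V4I32( 4, 3, 2, 1 );
//   AKSIMD_V4I32 v2 = AKSIMD_SETV_V4I32( 40, 30, 20, 10 );
//   AKSIMD_V4I32 vProd = AKSIMD_MULLO_V4I32( v1, v2 ); // (160, 90, 40, 10)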

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD packing / unpacking
//@{

/// Selects and interleaves the lower two single-precision, floating-point
/// values from a and b (see _mm_unpacklo_ps)
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )

/// Selects and interleaves the upper two single-precision, floating-point
/// values from a and b (see _mm_unpackhi_ps)
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )

// Given four pointers, gathers 32-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (b a)
// e.g. (*addr[1]) := (d c)
// e.g. (*addr[2]) := (f e)
// e.g. (*addr[3]) := (h g)
// return struct has
// val[0] := (g e c a)
// val[1] := (h f d b)
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
    __m128i data[4] = {
        _mm_set1_epi32(*(AkInt32*)addr0),
        _mm_set1_epi32(*(AkInt32*)addr1),
        _mm_set1_epi32(*(AkInt32*)addr2),
        _mm_set1_epi32(*(AkInt32*)addr3),
    };

    __m128i group[2] = {
        _mm_unpacklo_epi32(data[0], data[1]),
        _mm_unpacklo_epi32(data[2], data[3]),
    };

    __m128i shuffle = _mm_unpacklo_epi64(group[0], group[1]);

    AKSIMD_V4I32X2 ret{
        _mm_srai_epi32(_mm_slli_epi32(shuffle, 16), 16),
        _mm_srai_epi32(shuffle, 16)
    };
    return ret;
}
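
// Usage sketch: if each of the four pointers addresses one interleaved
// left/right 16-bit stereo frame, val[0] receives the four left samples and
// val[1] the four right samples, each sign-extended to 32 bits.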

// Given four pointers, gathers 64-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (d c b a)
// e.g. (*addr[1]) := (h g f e)
// e.g. (*addr[2]) := (l k j i)
// e.g. (*addr[3]) := (p o n m)
// return struct has
// val[0] := (m i e a)
// val[1] := (n j f b)
// val[2] := (o k g c)
// val[3] := (p l h d)
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
    __m128i data[4] = {
        _mm_set1_epi64x(*(AkInt64*)addr0),
        _mm_set1_epi64x(*(AkInt64*)addr1),
        _mm_set1_epi64x(*(AkInt64*)addr2),
        _mm_set1_epi64x(*(AkInt64*)addr3),
    };

    __m128i group[2] = {
        _mm_unpacklo_epi64(data[0], data[1]),
        _mm_unpacklo_epi64(data[2], data[3]),
    };

    __m128i shuffle[2] = {
        _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
        _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
    };

    AKSIMD_V4I32X4 ret{
        _mm_srai_epi32(_mm_slli_epi32(shuffle[0],16),16),
        _mm_srai_epi32(shuffle[0],16),
        _mm_srai_epi32(_mm_slli_epi32(shuffle[1],16),16),
        _mm_srai_epi32(shuffle[1],16),
    };
    return ret;
}

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD vector comparison
/// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
//@{

#define AKSIMD_CMP_CTRLMASK __m128

/// Vector "<=" operation (see _mm_cmple_ps)
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )

#define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )

/// Vector ">=" operation (see _mm_cmpge_ps)
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )

#define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )

/// Vector "==" operation (see _mm_cmpeq_ps)
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )

/// Returns a when the control mask is 0 and b when the control mask is non-zero. The mask is
/// given in vMask and is usually produced by the comparison operations above.
static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
{
    vB = _mm_and_ps( vB, vMask );
    vA = _mm_andnot_ps( vMask, vA );
    return _mm_or_ps( vA, vB );
}
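
// Usage sketch: branchless clamp of negative samples to zero (vIn illustrative):
//
//   AKSIMD_V4COND vMask = AKSIMD_GTEQ_V4F32( vIn, AKSIMD_SETZERO_V4F32() );
//   AKSIMD_V4F32 vOut = AKSIMD_VSEL_V4F32( AKSIMD_SETZERO_V4F32(), vIn, vMask );
//   // lanes where vIn >= 0 keep their value; negative lanes become 0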

// (cond1 >= cond2) ? b : a.
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )

// a >= 0 ? b : c, following the usual C++ ternary operator ordering.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )

#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))

#define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ )

// returns true if every element of the provided vector is zero
static AkForceInline bool AKSIMD_TESTZERO_V4I32( AKSIMD_V4I32 a )
{
    return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_setzero_si128())) == 0xFFFF;
}
#define AKSIMD_TESTZERO_V4F32( __a__ ) AKSIMD_TESTZERO_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__ ) AKSIMD_TESTZERO_V4F32(__a__)

// returns true if every element of the provided vector is ones
static AkForceInline bool AKSIMD_TESTONES_V4I32( AKSIMD_V4I32 a )
{
    return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_set1_epi32(~0))) == 0xFFFF;
}
#define AKSIMD_TESTONES_V4F32( __a__ ) AKSIMD_TESTONES_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTONES_V4COND( __a__ ) AKSIMD_TESTONES_V4F32(__a__)

//@}
////////////////////////////////////////////////////////////////////////

/// Loads unaligned 128-bit value (see _mm_loadu_si128)
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Loads aligned 128-bit value (see _mm_loadu_si128)
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()

#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )

#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )

#define AKSIMD_SETV_V2I64( _b, _a ) _mm_set_epi64x( (_b), (_a) )

/// Sets the 32b integer i at the location specified by index in a
#define AKSIMD_INSERT_V4I32( a, i, index) _mm_insert_epi32(a, i, index)

/// Sets the 64b integer i at the location specified by index in a
#define AKSIMD_INSERT_V2I64( a, i, index) _mm_insert_epi64(a, i, index)

/// Stores four 32-bit integer values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_si128).
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )

/// Stores four 32-bit integer values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_si128).
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD conversion
//@{

/// Converts the four signed 32-bit integer values of a to single-precision,
/// floating-point values (see _mm_cvtepi32_ps)
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by rounding (see _mm_cvtps_epi32)
#define AKSIMD_ROUND_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by truncating (see _mm_cvttps_epi32)
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )

/// Computes the bitwise AND of the 128-bit value in a and the
/// 128-bit value in b (see _mm_and_si128)
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )

/// Compares the 8 signed 16-bit integers in a and the 8 signed
/// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )

/// Converts the 4 half-precision floats in the lower 64-bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))

/// Converts the 4 half-precision floats in the upper 64-bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))

static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( AKSIMD_V4I32 vec )
{
    __m128i expMantData = _mm_and_si128(vec, _mm_set1_epi32(0x7fff0000));
    __m128i expMantShifted = _mm_srli_epi32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds

    // Magic number to scale the fp16 exp range into the fp32 exp range (also renormalizes any denorms)
    __m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));

    // If the fp16 value was inf or nan, preserve the inf/nan exponent field (we can just 'or' the new inf-bits into the result of the scale above)
    __m128i infnanCheck = _mm_cmpgt_epi32(expMantData, _mm_set1_epi32(0x7bffffff));
    __m128i infnanExp = _mm_and_si128(infnanCheck, _mm_set1_epi32(255 << 23));
    __m128i expMantWithInfNan = _mm_or_si128(expMantFloat, infnanExp);

    // Reincorporate the sign
    __m128i signData = _mm_and_si128(vec, _mm_set1_epi32(0x80000000));
    __m128 assembledFloat = _mm_castsi128_ps(_mm_or_si128(signData, expMantWithInfNan));
    return assembledFloat;
}
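
// Usage sketch: expanding 8 packed half-precision floats to two float vectors
// (pSrc is an illustrative pointer to 16-bit half-float data):
//
//   AKSIMD_V4I32 vHalf = AKSIMD_LOADU_V4I32( (__m128i*)pSrc );
//   AKSIMD_V4F32 vLo = AKSIMD_CONVERT_V4F16_TO_V4F32_LO( vHalf );
//   AKSIMD_V4F32 vHi = AKSIMD_CONVERT_V4F16_TO_V4F32_HI( vHalf );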

/// Converts the 4 full-precision floats vector to 4 half-precision floats
/// occupying the lower bits and leaving the upper bits as zero
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16( AKSIMD_V4F32 vec )
{
    __m128i signData = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x80000000));
    __m128i unsignedVec = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x7fffffff));

    // Processing for values that will be denormal in float16:
    // add 0.5 to get the value within range and round; then move the mantissa data up
    __m128 denormedVec = _mm_add_ps(_mm_castsi128_ps(unsignedVec), _mm_set1_ps(0.5f));
    __m128i denormResult = _mm_slli_epi32(_mm_castps_si128(denormedVec), 16);

    // Processing for values that will be normal in float16
    __m128i subnormMagic = _mm_set1_epi32(0xC8000FFF); // -131072 + rounding bias
    __m128i normRoundPart1 = _mm_add_epi32(unsignedVec, subnormMagic);
    __m128i mantLsb = _mm_slli_epi32(unsignedVec, 31 - 13);
    __m128i mantSignExtendLsb = _mm_srai_epi32(mantLsb, 31); // extend the LSB so that it's -1 when set
    __m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
    __m128i normResult = _mm_slli_epi32(normRoundPart2, 3);

    // Combine the norm and subnorm paths together
    __m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23); // smallest float32 that yields a normalized float16
    __m128i denormMask = _mm_cmpgt_epi32(normalMinimum, unsignedVec);

    __m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));

    // Apply the inf/nan check
    __m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000)); // true where the value stays within float16's finite range
    __m128i mantissaData = _mm_and_si128(unsignedVec, _mm_set1_epi32(0x007fffff));
    __m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000)); // mark the parts of the vector where we have a mantissa (i.e. NaN) as 0xffffffff
    __m128i nanMantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000)); // set the NaN mantissa bit if the mantissa suggests this is NaN
    __m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000)); // grab the exponent data from the unsigned vec with no mantissa
    __m128i infNanFloat = _mm_or_si128(infData, nanMantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit

    __m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));

    // Reincorporate the original sign
    __m128i signedResult = _mm_or_si128(signData, resultWithInfNan);

    // Store results packed in the lower 64 bits, and set the upper 64 to zero
    __m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD); // move 16b ints (x,x,x,x,d,c,b,a) down to (x,x,x,x,x,x,d,b)
    __m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD); // move 16b ints (h,g,f,e,x,x,x,x) down to (x,x,h,f,x,x,x,x)
    __m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4); // combine - (x,x,h,f,x,x,d,b)
    __m128i result = _mm_castps_si128(_mm_shuffle_ps(resultEpi16, _mm_setzero_ps(), 0x8)); // reshuffle with zero - (0,0,0,0,h,f,d,b)

    return result;
}
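
// Usage sketch: a float-to-half round trip (values illustrative):
//
//   AKSIMD_V4F32 vIn = AKSIMD_SETV_V4F32( 4.f, 3.f, 2.f, 1.f );
//   AKSIMD_V4I32 vHalf = AKSIMD_CONVERT_V4F32_TO_V4F16( vIn ); // 4 halves in the low 64 bits
//   AKSIMD_V4F32 vBack = AKSIMD_CONVERT_V4F16_TO_V4F32_LO( vHalf );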

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD cast
//@{

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) _mm_castpd_ps(__vec__)

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) _mm_castpd_si128(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) _mm_castps_pd(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) _mm_castsi128_pd(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) _mm_castsi128_ps(__vec__)

/// Cast vector of type AKSIMD_V4COND to AKSIMD_V4F32.
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V4F32 to AKSIMD_V4COND.
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V4COND to AKSIMD_V4I32.
#define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)

/// Cast vector of type AKSIMD_V4I32 to AKSIMD_V4COND.
#define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) _mm_castsi128_ps(__vec__)

//@}
////////////////////////////////////////////////////////////////////////

/// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
/// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )

/// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
/// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )

/// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
/// integers and saturates (see _mm_packs_epi32)
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shifting
//@{

/// Shifts the 4 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
    _mm_slli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed or unsigned 32-bit integers in a right by
/// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
    _mm_srli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_epi32)
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
    _mm_srai_epi32( (__vec__), (__shiftBy__) )

//@}
////////////////////////////////////////////////////////////////////////
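
// Usage sketch: one way to scale sign-extended 16-bit samples up to the top
// of a 32-bit word (Q15 to Q31; vSamples is illustrative):
//
//   AKSIMD_V4I32 vQ31 = AKSIMD_SHIFTLEFT_V4I32( vSamples, 16 );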

#if defined( AK_CPU_X86 ) /// MMX

typedef __m64 AKSIMD_V2F32; ///< Vector of 2 32-bit floats

#define AKSIMD_SETZERO_V2F32() _mm_setzero_si64();

#define AKSIMD_CMPGT_V2I32( a, b ) _mm_cmpgt_pi16(a,b)

/// Interleaves the lower 2 signed or unsigned 16-bit integers in a with
/// the lower 2 signed or unsigned 16-bit integers in b (see _mm_unpacklo_pi16)
#define AKSIMD_UNPACKLO_VECTOR4I16( a, b ) _mm_unpacklo_pi16( a, b )

/// Interleaves the upper 2 signed or unsigned 16-bit integers in a with
/// the upper 2 signed or unsigned 16-bit integers in b (see _mm_unpackhi_pi16)
#define AKSIMD_UNPACKHI_VECTOR4I16( a, b ) _mm_unpackhi_pi16( a, b )

/// Shifts the 2 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_pi32)
#define AKSIMD_SHIFTLEFT_V2I32( __vec__, __shiftBy__ ) \
    _mm_slli_pi32( (__vec__), (__shiftBy__) )

/// Shifts the 2 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_pi32)
#define AKSIMD_SHIFTRIGHTARITH_V2I32( __vec__, __shiftBy__ ) \
    _mm_srai_pi32( (__vec__), (__shiftBy__) )

/// Used when ending a block of code that utilizes any MMX construct on x86 code
/// so that the x87 FPU can be used again
#define AKSIMD_MMX_EMPTY _mm_empty();

#endif


#endif //_AK_SIMD_SSE_H_