Wwise SDK 2019.2.15
AkSimd.h
/*******************************************************************************
The content of this file includes portions of the AUDIOKINETIC Wwise Technology
released in source code form as part of the SDK installer package.

Commercial License Usage

Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
may use this file in accordance with the end user license agreement provided
with the software or, alternatively, in accordance with the terms contained in a
written agreement between you and Audiokinetic Inc.

Apache License Usage

Alternatively, this file may be used under the Apache License, Version 2.0 (the
"Apache License"); you may not use this file except in compliance with the
Apache License. You may obtain a copy of the Apache License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing, software distributed
under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
the specific language governing permissions and limitations under the License.

  Version: <VERSION>  Build: <BUILDNUMBER>
  Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
*******************************************************************************/

// AkSimd.h

/// \file
/// AKSIMD - SSE implementation

#ifndef _AK_SIMD_SSE_H_
#define _AK_SIMD_SSE_H_

#include <AK/SoundEngine/Common/AkTypes.h>
#include <xmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>

////////////////////////////////////////////////////////////////////////
/// @name Platform specific defines for prefetching
//@{

#define AKSIMD_ARCHCACHELINESIZE (64)		///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE (512)	///< Use this to control the maximum amount of prefetching desirable (assuming 8-way cache)
/// Cross-platform memory prefetch of effective address assuming non-temporal data
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )

//@}
////////////////////////////////////////////////////////////////////////
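
// --- Illustrative sketch (not part of the original header): prefetching one
// cache line ahead while walking a large input buffer. The function and
// variable names are assumptions for the example; the vector types and load
// macro used here are defined further down in this file.
//
// static inline void ProcessWithPrefetch( const AkReal32* in_pData, AkUInt32 in_uNumFloats )
// {
// 	for ( AkUInt32 i = 0; i < in_uNumFloats; i += 4 )
// 	{
// 		// Request the line AKSIMD_ARCHCACHELINESIZE bytes ahead of the read
// 		// position; the NTA hint marks the data as non-temporal.
// 		AKSIMD_PREFETCHMEMORY( AKSIMD_ARCHCACHELINESIZE, in_pData + i );
// 		AKSIMD_V4F32 v = AKSIMD_LOAD_V4F32( in_pData + i );
// 		/* ... process v ... */
// 	}
// }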

////////////////////////////////////////////////////////////////////////
/// @name Platform specific memory size alignment for allocation purposes
//@{
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
//@}
////////////////////////////////////////////////////////////////////////
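
// Worked example (sketch): AKSIMD_ALIGNSIZE rounds a byte count up to the next
// multiple of 16, e.g. AKSIMD_ALIGNSIZE(1) == 16, AKSIMD_ALIGNSIZE(16) == 16,
// AKSIMD_ALIGNSIZE(17) == 32, which suits 16-byte-aligned allocations.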

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD types
//@{

typedef float AKSIMD_F32;		///< 32-bit float
typedef __m128 AKSIMD_V4F32;		///< Vector of 4 32-bit floats
typedef AKSIMD_V4F32 AKSIMD_V4COND;	///< Vector of 4 comparison results
typedef AKSIMD_V4F32 AKSIMD_V4FCOND;	///< Vector of 4 comparison results

typedef __m128i AKSIMD_V4I32;		///< Vector of 4 32-bit signed integers

struct AKSIMD_V4I32X2 {			///< Pair of 4 32-bit signed integers
	AKSIMD_V4I32 val[2];
};

struct AKSIMD_V4I32X4 {			///< Quartet of 4 32-bit signed integers
	AKSIMD_V4I32 val[4];
};

typedef AKSIMD_V4I32 AKSIMD_V4ICOND;

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD loading / setting
//@{

/// Loads four single-precision, floating-point values (see _mm_load_ps)
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_load_ps( (AkReal32*)(__addr__) )

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )

/// Loads a single single-precision, floating-point value, copying it into
/// all four words (see _mm_load1_ps, _mm_load_ps1)
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )

/// Sets the four single-precision, floating-point values to in_value (see
/// _mm_set1_ps, _mm_set_ps1)
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )

/// Sets the four single-precision, floating-point values to zero (see
/// _mm_setzero_ps)
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()

/// Loads a single-precision, floating-point value into the low word
/// and clears the upper three words.
/// r0 := *p; r1 := 0.0; r2 := 0.0; r3 := 0.0 (see _mm_load_ss)
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )

//@}
////////////////////////////////////////////////////////////////////////
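
// --- Illustrative sketch (not part of the original header): common ways to
// materialize a vector. Variable names are assumptions; lanes are written in
// memory order, lane 0 first.
//
// AkReal32 data[4] = { 1.f, 2.f, 3.f, 4.f };	// assumed 16-byte aligned for AKSIMD_LOAD_V4F32
// AKSIMD_V4F32 v0 = AKSIMD_LOAD_V4F32( data );		// { 1, 2, 3, 4 }
// AKSIMD_V4F32 v1 = AKSIMD_SET_V4F32( 0.5f );		// { 0.5, 0.5, 0.5, 0.5 }
// AKSIMD_V4F32 v2 = AKSIMD_SETZERO_V4F32();		// { 0, 0, 0, 0 }
// AKSIMD_V4F32 v3 = AKSIMD_LOAD_SS_V4F32( data );	// { 1, 0, 0, 0 }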


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD storing
//@{

/// Stores four single-precision, floating-point values. The address
/// must be 16-byte aligned (see _mm_store_ps)
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_store_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores four single-precision, floating-point values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_ps).
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lower single-precision, floating-point value.
/// *p := a0 (see _mm_store_ss)
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )

//@}
////////////////////////////////////////////////////////////////////////
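
// --- Illustrative sketch (not part of the original header): applying a gain
// to a float buffer with unaligned loads/stores; uses AKSIMD_MUL_V4F32 from
// the arithmetic section below. Names are assumptions for the example.
//
// static inline void ApplyGain( AkReal32* io_pBuf, AkUInt32 in_uNumFloats, AkReal32 in_fGain )
// {
// 	const AKSIMD_V4F32 vGain = AKSIMD_SET_V4F32( in_fGain );
// 	AkUInt32 i = 0;
// 	for ( ; i + 4 <= in_uNumFloats; i += 4 )
// 	{
// 		AKSIMD_V4F32 v = AKSIMD_LOADU_V4F32( io_pBuf + i );
// 		AKSIMD_STOREU_V4F32( io_pBuf + i, AKSIMD_MUL_V4F32( v, vGain ) );
// 	}
// 	for ( ; i < in_uNumFloats; ++i )	// scalar tail for the last 0-3 samples
// 		io_pBuf[i] *= in_fGain;
// }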

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shuffling
//@{

// Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )

/// Selects four specific single-precision, floating-point values from
/// a and b, based on the mask i (see _mm_shuffle_ps)
// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )

#define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), i ))

/// Moves the upper two single-precision, floating-point values of b to
/// the lower two single-precision, floating-point values of the result.
/// The upper two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )

/// Moves the lower two single-precision, floating-point values of b to
/// the upper two single-precision, floating-point values of the result.
/// The lower two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := b1; r2 := b0; r1 := a1; r0 := a0 (see _mm_movelh_ps)
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )

/// Swap the 2 lower floats together and the 2 higher floats together.
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))

/// Swap the 2 lower floats with the 2 higher floats.
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))

/// Barrel-shift all floats by one.
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))

/// Duplicates the odd items into the even items (d c b a -> d d b b)
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even items into the odd items (d c b a -> c c a a)
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))

//@}
////////////////////////////////////////////////////////////////////////
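
// Worked example (sketch, not part of the original header), writing lanes in
// memory order with lane 0 first:
//
// AKSIMD_V4F32 v = { 1.f, 2.f, 3.f, 4.f };
// AKSIMD_SHUFFLE_V4F32( v, v, AKSIMD_SHUFFLE(0,1,2,3) );	// { 4, 3, 2, 1 } (reversed)
// AKSIMD_SHUFFLE_BADC( v );	// { 2, 1, 4, 3 }
// AKSIMD_SHUFFLE_CDAB( v );	// { 3, 4, 1, 2 }
// AKSIMD_SHUFFLE_BCDA( v );	// { 2, 3, 4, 1 }
// AKSIMD_DUP_EVEN( v );	// { 1, 1, 3, 3 }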


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD arithmetic
//@{

/// Subtracts the four single-precision, floating-point values of
/// a and b (a - b) (see _mm_sub_ps)
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )

/// Subtracts the lower single-precision, floating-point values of a and b.
/// The upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 - b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_sub_ss)
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )

/// Adds the four single-precision, floating-point values of
/// a and b (see _mm_add_ps)
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )

/// Adds the lower single-precision, floating-point values of a and b; the
/// upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )

/// Multiplies the four single-precision, floating-point values
/// of a and b (see _mm_mul_ps)
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )

/// Divides the four single-precision, floating-point values
/// of a by b (see _mm_div_ps)
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )

/// Multiplies the lower single-precision, floating-point values of
/// a and b; the upper three single-precision, floating-point values
/// are passed through from a.
/// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )

/// Vector multiply-add and multiply-subtract operations.
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )

/// Vector multiply-add operation.
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

/// Computes the minima of the four single-precision, floating-point
/// values of a and b (see _mm_min_ps)
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )

/// Computes the maxima of the four single-precision, floating-point
/// values of a and b (see _mm_max_ps)
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )

/// Computes the absolute value
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)

/// Changes the sign
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)

/// Vector square root approximation (see _mm_sqrt_ps)
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
#define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )

/// Reciprocal of x (1/x)
#define AKSIMD_RECIP_V4F32(__a__) _mm_rcp_ps(__a__)

/// Binary xor for single-precision floating-point
#define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps(a,b)

/// Rounds to upper value
static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32( const AKSIMD_V4F32 & x )
{
	static const AKSIMD_V4F32 vEpsilon = { 0.49999f, 0.49999f, 0.49999f, 0.49999f };
	return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, vEpsilon)));
}
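
// Worked example (sketch): AKSIMD_CEIL_V4F32( { 1.25f, -1.25f, 2.f, 0.f } )
// yields { 2.f, -1.f, 2.f, 0.f }. Adding 0.49999 then converting with
// round-to-nearest approximates ceil(); exact integers stay put because the
// epsilon is slightly below one half.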

/// Faked in-place vector horizontal add - each element will represent sum of all elements
/// \akwarning
/// Don't expect this to be very efficient.
/// \endakwarning
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
{
	__m128 vAb = _mm_shuffle_ps(vVec, vVec, 0xB1);
	__m128 vHaddAb = _mm_add_ps(vVec, vAb);
	__m128 vHaddCd = _mm_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
	__m128 vHaddAbcd = _mm_add_ps(vHaddAb, vHaddCd);
	return vHaddAbcd;
}
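
// Worked example (sketch): with vVec = { a, b, c, d }, shuffle 0xB1 swaps
// within pairs to give { b, a, d, c }; the first add produces
// { a+b, a+b, c+d, c+d }; shuffle 0x4E swaps the two halves to give
// { c+d, c+d, a+b, a+b }; the final add leaves a+b+c+d in every lane.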

/// Dot product: multiplies vVec by vfSigns element-wise, then sums all elements into every lane
static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
{
	AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
	return AKSIMD_HORIZONTALADD_V4F32( vfDotProduct );
}

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
	static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };

	AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
	vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
	AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
	vTmp2 = AKSIMD_XOR_V4F32( vTmp2, vSign );
	vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
	return vTmp2;
}
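
// Worked example (sketch): with vCIn1 = { a0, b0, a1, b1 } and
// vCIn2 = { c0, d0, c1, d1 } holding (a + bi) and (c + di) interleaved,
// vTmp1 = { a0c0, a0d0, a1c1, a1d1 }. vTmp2 starts as { b0, b0, b1, b1 };
// the xor with vSign flips the even lanes to { -b0, b0, -b1, b1 }, and the
// multiply-add against the swapped vCIn2 = { d0, c0, d1, c1 } produces
// { a0c0 - b0d0, a0d0 + b0c0, a1c1 - b1d1, a1d1 + b1c1 }, the complex products.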

#ifdef AK_SSE3

#include <pmmintrin.h>

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
	AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1);	// multiplier real (a1, a1, a0, a0)
	vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2);		// temp1 (a1d1, a1c1, a0d0, a0c0)
	AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1);	// shuf multiplicand (c1, d1, c0, d0)
	AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1);	// multiplier imag (b1, b1, b0, b0)
	xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1);		// temp2 (b1c1, b1d1, b0c0, b0d0)
	AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2);	// b1c1+a1d1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0
	return vCOut;
}

#endif

#if defined _MSC_VER && ( _MSC_VER <= 1600 )
	#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#else
	#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#endif

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD integer arithmetic
//@{

/// Adds the four integer values of a and b
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )

#define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b)
#define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b)
#define AKSIMD_OR_V4I32( a, b ) _mm_or_si128(a,b)
#define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b)
#define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b)
#define AKSIMD_NOT_V4I32( a ) _mm_xor_si128(a,_mm_set1_epi32(~0))

#define AKSIMD_OR_V4F32( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4F32( a, b ) _mm_and_ps(a,b)
#define AKSIMD_ANDNOT_V4F32( a, b ) _mm_andnot_ps(a,b)

#define AKSIMD_OR_V4COND( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4COND( a, b ) _mm_and_ps(a,b)

/// Multiplies the low 16 bits of a by b and stores it in V4I32 (no overflow)
#define AKSIMD_MULLO16_V4I32( a , b) _mm_mullo_epi16(a, b)
//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD packing / unpacking
//@{

/// Selects and interleaves the lower two single-precision, floating-point
/// values from a and b (see _mm_unpacklo_ps)
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )

/// Selects and interleaves the upper two single-precision, floating-point
/// values from a and b (see _mm_unpackhi_ps)
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )

// Given four pointers, gathers 32 bits of data from each location,
// deinterleaves them into 16-bit values, and sign-extends to 32 bits
// e.g. (*addr[0]) := (b a)
// e.g. (*addr[1]) := (d c)
// e.g. (*addr[2]) := (f e)
// e.g. (*addr[3]) := (h g)
// return struct has
// val[0] := (g e c a)
// val[1] := (h f d b)
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
	__m128i data[4] = {
		_mm_set1_epi32(*(AkInt32*)addr0),
		_mm_set1_epi32(*(AkInt32*)addr1),
		_mm_set1_epi32(*(AkInt32*)addr2),
		_mm_set1_epi32(*(AkInt32*)addr3),
	};

	__m128i group[2] = {
		_mm_unpacklo_epi32(data[0], data[1]),
		_mm_unpacklo_epi32(data[2], data[3]),
	};

	__m128i shuffle = _mm_unpacklo_epi64(group[0], group[1]);

	AKSIMD_V4I32X2 ret{
		_mm_srai_epi32(_mm_slli_epi32(shuffle, 16), 16),
		_mm_srai_epi32(shuffle, 16)
	};
	return ret;
}

// Given four pointers, gathers 64 bits of data from each location,
// deinterleaves them into 16-bit values, and sign-extends to 32 bits
// e.g. (*addr[0]) := (d c b a)
// e.g. (*addr[1]) := (h g f e)
// e.g. (*addr[2]) := (l k j i)
// e.g. (*addr[3]) := (p o n m)
// return struct has
// val[0] := (m i e a)
// val[1] := (n j f b)
// val[2] := (o k g c)
// val[3] := (p l h d)
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
	__m128i data[4] = {
		_mm_set1_epi64x(*(AkInt64*)addr0),
		_mm_set1_epi64x(*(AkInt64*)addr1),
		_mm_set1_epi64x(*(AkInt64*)addr2),
		_mm_set1_epi64x(*(AkInt64*)addr3),
	};

	__m128i group[2] = {
		_mm_unpacklo_epi64(data[0], data[1]),
		_mm_unpacklo_epi64(data[2], data[3]),
	};

	__m128i shuffle[2] = {
		_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
		_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
	};

	AKSIMD_V4I32X4 ret{
		_mm_srai_epi32(_mm_slli_epi32(shuffle[0],16),16),
		_mm_srai_epi32(shuffle[0],16),
		_mm_srai_epi32(_mm_slli_epi32(shuffle[1],16),16),
		_mm_srai_epi32(shuffle[1],16),
	};
	return ret;
}

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD vector comparison
/// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
//@{

#define AKSIMD_CMP_CTRLMASK __m128

/// Vector "<=" operation (see _mm_cmple_ps)
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )

/// Vector "<" operation (see _mm_cmplt_ps)
#define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )

/// Vector ">=" operation (see _mm_cmpge_ps)
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )

/// Vector ">" operation (see _mm_cmpgt_ps)
#define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )

/// Vector "==" operation (see _mm_cmpeq_ps)
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )

/// Return a when the control mask is 0, return b when the control mask is non zero; the control mask is in vMask and is usually produced by the comparison operations above
static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
{
	vB = _mm_and_ps( vB, vMask );
	vA = _mm_andnot_ps( vMask, vA );
	return _mm_or_ps( vA, vB );
}
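
// --- Illustrative sketch (not part of the original header): clamping
// negative lanes to zero with a comparison mask. Names are assumptions.
//
// AKSIMD_V4F32 vIn = { -1.f, 2.f, -3.f, 4.f };
// AKSIMD_V4COND vMask = AKSIMD_GTEQ_V4F32( vIn, AKSIMD_SETZERO_V4F32() );
// AKSIMD_V4F32 vOut = AKSIMD_VSEL_V4F32( AKSIMD_SETZERO_V4F32(), vIn, vMask );
// // vOut == { 0, 2, 0, 4 }: lanes where the mask is all ones keep vB (vIn),
// // the others take vA (zero).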

// (cond1 >= cond2) ? b : a.
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )

// a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )

#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))

#define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ )

// Returns true if every element of the provided vector is zero
static AkForceInline bool AKSIMD_TESTZERO_V4I32( AKSIMD_V4I32 a )
{
	return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_setzero_si128())) == 0xFFFF;
}
#define AKSIMD_TESTZERO_V4F32( __a__ ) AKSIMD_TESTZERO_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__ ) AKSIMD_TESTZERO_V4F32(__a__)

// Returns true if every element of the provided vector is ones
static AkForceInline bool AKSIMD_TESTONES_V4I32( AKSIMD_V4I32 a )
{
	return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_set1_epi32(~0))) == 0xFFFF;
}
#define AKSIMD_TESTONES_V4F32( __a__ ) AKSIMD_TESTONES_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTONES_V4COND( __a__ ) AKSIMD_TESTONES_V4F32(__a__)

//@}
////////////////////////////////////////////////////////////////////////

/// Loads unaligned 128-bit value (see _mm_loadu_si128)
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Loads aligned 128-bit value (see _mm_load_si128)
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_load_si128( (__addr__) )

/// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()

/// Sets the four 32-bit integer values to __scalar__ (see _mm_set1_epi32)
#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )

#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )

#define AKSIMD_SETV_V2I64( _b, _a ) _mm_set_epi64x( (_b), (_a) )

/// Sets the 32b integer i at the location specified by index in a
#define AKSIMD_INSERT_V4I32( a, i, index ) _mm_insert_epi32(a, i, index)

/// Sets the 64b integer i at the location specified by index in a
#define AKSIMD_INSERT_V2I64( a, i, index ) _mm_insert_epi64(a, i, index)

/// Stores four 32-bit integer values. The address must be 16-byte aligned (see _mm_store_si128).
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_store_si128( (__m128i*)(__addr__), (__vec__) )

/// Stores four 32-bit integer values. The address
/// does not need to be 16-byte aligned (see _mm_storeu_si128).
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD conversion
//@{

/// Converts the four signed 32-bit integer values of a to single-precision,
/// floating-point values (see _mm_cvtepi32_ps)
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by rounding (see _mm_cvtps_epi32)
#define AKSIMD_ROUND_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by truncating (see _mm_cvttps_epi32)
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )

/// Computes the bitwise AND of the 128-bit value in a and the
/// 128-bit value in b (see _mm_and_si128)
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )

/// Compares the 8 signed 16-bit integers in a and the 8 signed
/// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )

/// Converts the 4 half-precision floats in the lower 64 bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))

/// Converts the 4 half-precision floats in the upper 64 bits of the provided
/// vector to 4 full-precision floats
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))

static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( AKSIMD_V4I32 vec )
{
	__m128i expMantData = _mm_and_si128(vec, _mm_set1_epi32(0x7fff0000));
	__m128i expMantShifted = _mm_srli_epi32(expMantData, 3);	// shift so that the float16 exp/mant is now split along float32's bounds

	// magic number to scale the fp16 exp range into the fp32 exp range (also renormalizes any denorms)
	__m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));

	// if the fp16 value was inf or nan, preserve the inf/nan exponent field (we can just 'or' the new inf-bits into the result of the scale attempt)
	__m128i infnanCheck = _mm_cmpgt_epi32(expMantData, _mm_set1_epi32(0x7bffffff));
	__m128i infnanExp = _mm_and_si128(infnanCheck, _mm_set1_epi32(255 << 23));
	__m128i expMantWithInfNan = _mm_or_si128(expMantFloat, infnanExp);

	// reincorporate the sign
	__m128i signData = _mm_and_si128(vec, _mm_set1_epi32(0x80000000));
	__m128 assembledFloat = _mm_castsi128_ps(_mm_or_si128(signData, expMantWithInfNan));
	return assembledFloat;
}

/// Converts the 4 full-precision floats vector to 4 half-precision floats
/// occupying the lower bits and leaving the upper bits as zero
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16( AKSIMD_V4F32 vec )
{
	__m128i signData = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x80000000));
	__m128i unsignedVec = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x7fffffff));

	// do the processing for values that will be denormed in float16
	// Add 0.5 to get the value within range, and round; then move the mantissa data up
	__m128 denormedVec = _mm_add_ps(_mm_castsi128_ps(unsignedVec), _mm_set1_ps(0.5f));
	__m128i denormResult = _mm_slli_epi32(_mm_castps_si128(denormedVec), 16);

	// processing for values that will be normal in float16
	__m128i subnormMagic = _mm_set1_epi32(0xC8000FFF);	// -131072 + rounding bias
	__m128i normRoundPart1 = _mm_add_epi32(unsignedVec, subnormMagic);
	__m128i mantLsb = _mm_slli_epi32(unsignedVec, 31 - 13);
	__m128i mantSignExtendLsb = _mm_srai_epi32(mantLsb, 31);	// extend the LSB so that it's -1 when set
	__m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb);	// and subtract the sign-extended bit to finish rounding up
	__m128i normResult = _mm_slli_epi32(normRoundPart2, 3);

	// combine the norm and subnorm paths together
	__m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23);	// smallest float32 that yields a normalized float16
	__m128i denormMask = _mm_cmpgt_epi32(normalMinimum, unsignedVec);

	__m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));

	// apply inf/nan check
	__m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000));	// true when the value stays below the range where float16 becomes inf/nan
	__m128i mantissaData = _mm_and_si128(unsignedVec, _mm_set1_epi32(0x007fffff));
	__m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000));	// mark the parts of the vector where we have a mantissa (i.e. NaN) as 0xffffffff
	__m128i nantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000));	// set the NaN mantissa bit if the mantissa suggests this is NaN
	__m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000));	// grab the exponent data from the unsigned vec with no mantissa
	__m128i infNanFloat = _mm_or_si128(infData, nantissaBit);	// if we have a non-zero mantissa, add the NaN mantissa bit

	__m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));

	// reincorporate the original sign
	__m128i signedResult = _mm_or_si128(signData, resultWithInfNan);

	// store results packed in lower 64 bits, and set upper 64 to zero
	__m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD);	// move 16b ints (x,x,x,x,d,c,b,a) down to (x,x,x,x,x,x,d,b)
	__m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD);	// move 16b ints (h,g,f,e,x,x,x,x) down to (x,x,h,f,x,x,x,x)
	__m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4);	// combine - (x,x,h,f,x,x,d,b)
	__m128i result = _mm_castps_si128(_mm_shuffle_ps(resultEpi16, _mm_setzero_ps(), 0x8));	// reshuffle with zero - (0,0,0,0,h,f,d,b)

	return result;
}
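
// --- Illustrative sketch (not part of the original header): a float32 ->
// float16 -> float32 round trip. The four halves land in the lower 64 bits of
// the integer vector; values come back rounded to the nearest float16.
//
// AKSIMD_V4F32 vIn = { 1.f, 0.333333f, -2.5f, 65504.f };	// 65504 is the largest finite float16
// AKSIMD_V4I32 vHalf = AKSIMD_CONVERT_V4F32_TO_V4F16( vIn );
// AKSIMD_V4F32 vOut = AKSIMD_CONVERT_V4F16_TO_V4F32_LO( vHalf );	// approximately vIn again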

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD cast
//@{

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) _mm_castpd_ps(__vec__)

/// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) _mm_castpd_si128(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) _mm_castps_pd(__vec__)

/// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) _mm_castsi128_pd(__vec__)

/// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) _mm_castsi128_ps(__vec__)

/// Cast vector of type AKSIMD_V4COND to AKSIMD_V4F32.
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V4F32 to AKSIMD_V4COND.
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) (__vec__)

//@}
////////////////////////////////////////////////////////////////////////

/// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
/// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )

/// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
/// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )

/// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
/// integers and saturates (see _mm_packs_epi32)
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shifting
//@{

/// Shifts the 4 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	_mm_slli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed or unsigned 32-bit integers in a right by
/// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
	_mm_srli_epi32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_epi32)
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	_mm_srai_epi32( (__vec__), (__shiftBy__) )

//@}
////////////////////////////////////////////////////////////////////////

#if defined( AK_CPU_X86 ) /// MMX

typedef __m64 AKSIMD_V2F32;	///< Vector of 2 32-bit floats

#define AKSIMD_SETZERO_V2F32() _mm_setzero_si64();

#define AKSIMD_CMPGT_V2I32( a, b ) _mm_cmpgt_pi16(a,b)

/// Interleaves the lower 2 signed or unsigned 16-bit integers in a with
/// the lower 2 signed or unsigned 16-bit integers in b (see _mm_unpacklo_pi16)
#define AKSIMD_UNPACKLO_VECTOR4I16( a, b ) _mm_unpacklo_pi16( a, b )

/// Interleaves the upper 2 signed or unsigned 16-bit integers in a with
/// the upper 2 signed or unsigned 16-bit integers in b (see _mm_unpackhi_pi16)
#define AKSIMD_UNPACKHI_VECTOR4I16( a, b ) _mm_unpackhi_pi16( a, b )

/// Shifts the 2 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_pi32)
#define AKSIMD_SHIFTLEFT_V2I32( __vec__, __shiftBy__ ) \
	_mm_slli_pi32( (__vec__), (__shiftBy__) )

/// Shifts the 2 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_pi32)
#define AKSIMD_SHIFTRIGHTARITH_V2I32( __vec__, __shiftBy__ ) \
	_mm_srai_pi32( (__vec__), (__shiftBy__) )

/// Used when ending a block of code that utilizes any MMX construct on x86 code
/// so that the x87 FPU can be used again
#define AKSIMD_MMX_EMPTY _mm_empty();

#endif


#endif //_AK_SIMD_SSE_H_