Table of Contents

Wwise SDK 2019.1.6
AkSimd.h
Go to the documentation of this file.
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Version: <VERSION> Build: <BUILDNUMBER>
25  Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
26 *******************************************************************************/
27 
28 // AkSimd.h
29 
30 /// \file
31 /// AKSIMD - Generic (no SIMD support) implementation
32 
33 #ifndef _AKSIMD_GENERIC_H_
34 #define _AKSIMD_GENERIC_H_
35 
36 #include <math.h>
37 #include <string.h>
40 
41 ////////////////////////////////////////////////////////////////////////
42 /// @name AKSIMD types
43 //@{
// Generic (non-SIMD) vector types: every vector is a plain struct wrapping an m_data array,
// so lanes are accessed as ordinary array elements.
44 typedef AkInt32 AKSIMD_I32; ///< 32-bit signed integer
45 typedef struct { AkInt32 m_data[4]; } AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers
46 typedef struct { AkUInt32 m_data[4]; } AKSIMD_V4UI32; ///< Vector of 4 32-bit unsigned integers
47 typedef AkReal32 AKSIMD_F32; ///< 32-bit float
48 typedef struct { AkReal32 m_data[2]; } AKSIMD_V2F32; ///< Vector of 2 32-bit floats
49 typedef struct { AkReal32 m_data[4]; } AKSIMD_V4F32; ///< Vector of 4 32-bit floats
50 typedef AKSIMD_V4UI32 AKSIMD_V4COND; ///< Vector of 4 comparison results
51 
// "Unaligned" variants of the vector structs, declared between #pragma pack(push,1) and
// #pragma pack(pop) so that they are laid out with 1-byte packing.
52 #pragma pack(push,1)
53 typedef struct { AkInt32 m_data[4]; } AKSIMD_V4I32_UNALIGNED; ///< Unaligned Vector of 4 32-bit signed integers
54 typedef struct { AkUInt32 m_data[4]; } AKSIMD_V4UI32_UNALIGNED; ///< Unaligned Vector of 4 32-bit unsigned integers
55 typedef struct { AkReal32 m_data[2]; } AKSIMD_V2F32_UNALIGNED; ///< Unaligned Vector of 2 32-bit floats
56 typedef struct { AkReal32 m_data[4]; } AKSIMD_V4F32_UNALIGNED; ///< Unaligned Vector of 4 32-bit floats
57 #pragma pack(pop)
58 
59 //@}
60 ////////////////////////////////////////////////////////////////////////
61 
// Lane accessors. Each one expands to (__vName).m_data[(__num__)], which is an lvalue, so the
// macros can be used both to read a lane and to assign to it. The #ifndef guards keep any
// definition that already exists when this header is included.
62 #ifndef AKSIMD_GETELEMENT_V4F32
63 #define AKSIMD_GETELEMENT_V4F32( __vName, __num__ ) (__vName).m_data[(__num__)]
64 #endif
65
66 #ifndef AKSIMD_GETELEMENT_V2F32
67 #define AKSIMD_GETELEMENT_V2F32( __vName, __num__ ) (__vName).m_data[(__num__)]
68 #endif
69
70 #ifndef AKSIMD_GETELEMENT_V4I32
71 #define AKSIMD_GETELEMENT_V4I32( __vName, __num__ ) (__vName).m_data[(__num__)]
72 #endif
73 
74 ////////////////////////////////////////////////////////////////////////
75 /// @name Platform specific memory size alignment for allocation purposes
76 //@{
// Rounds __Size__ up to the next multiple of 16 (adds 15, then clears the four low bits).
77 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
78 //@}
79 ////////////////////////////////////////////////////////////////////////
80 
81 ////////////////////////////////////////////////////////////////////////
82 /// @name AKSIMD loading / setting
83 //@{
// Load macros. AKSIMD_LOADU_V4I32 dereferences in_pData exactly as given. Both V4F32 loads
// cast the pointer to AKSIMD_V4F32* and dereference it, so in this generic implementation
// the aligned (LOAD) and unaligned (LOADU) forms expand to the same expression.
84 #define AKSIMD_LOADU_V4I32( in_pData ) (*(in_pData))
85
86 #define AKSIMD_LOADU_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))
87
88 #define AKSIMD_LOAD_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))
89 
// Body of AKSIMD_LOAD1_V4F32(AKSIMD_F32 in_value). The signature (listing line 90) is missing
// from this extract; the name comes from the symbol index at the end of the file.
// Returns a vector with in_value copied into all four lanes.
91 {
92  AKSIMD_V4F32 vector;
93  vector.m_data[0] = in_value;
94  vector.m_data[1] = in_value;
95  vector.m_data[2] = in_value;
96  vector.m_data[3] = in_value;
97
98  return vector;
99 }
100 
101 // _mm_set_ps1
// Body of AKSIMD_SET_V4F32(AKSIMD_F32 in_value). The signature (listing line 102) is missing
// from this extract; the name comes from the symbol index at the end of the file.
// Returns a vector with in_value copied into all four lanes (same statements as the
// AKSIMD_LOAD1_V4F32 body).
103 {
104  AKSIMD_V4F32 vector;
105  vector.m_data[0] = in_value;
106  vector.m_data[1] = in_value;
107  vector.m_data[2] = in_value;
108  vector.m_data[3] = in_value;
109
110  return vector;
111 }
112 
113 
// Body of AKSIMD_SET_V2F32(AKSIMD_F32 in_value). The signature (listing line 114) is missing
// from this extract; the name comes from the symbol index at the end of the file.
// Returns a 2-lane vector with in_value in both lanes.
115 {
116  AKSIMD_V2F32 vector;
117  vector.m_data[0] = in_value;
118  vector.m_data[1] = in_value;
119
120  return vector;
121 }
122 
123 // _mm_setzero_ps()
// Body of AKSIMD_SETZERO_V4F32(). The signature (listing line 124) is missing from this
// extract; the name comes from the symbol index at the end of the file.
// Returns a 4-lane float vector with every lane set to 0.f.
125 {
126  AKSIMD_V4F32 vector;
127  vector.m_data[0] = 0.f;
128  vector.m_data[1] = 0.f;
129  vector.m_data[2] = 0.f;
130  vector.m_data[3] = 0.f;
131
132  return vector;
133 }
134 
// Body of AKSIMD_SETZERO_V2F32(). The signature (listing line 135) is missing from this
// extract; the name comes from the symbol index at the end of the file.
// Returns a 2-lane float vector with both lanes set to 0.f.
136 {
137  AKSIMD_V2F32 vector;
138  vector.m_data[0] = 0.f;
139  vector.m_data[1] = 0.f;
140
141  return vector;
142 }
143 // _mm_setzero_si128()
// Body of AKSIMD_SETZERO_V4I32(). The signature (listing line 144) is missing from this
// extract; the name comes from the symbol index at the end of the file.
// Returns a 4-lane integer vector with every lane set to 0.
145 {
146  AKSIMD_V4I32 vector;
147  vector.m_data[0] = 0;
148  vector.m_data[1] = 0;
149  vector.m_data[2] = 0;
150  vector.m_data[3] = 0;
151
152  return vector;
153 }
154 
155 
156 /// Loads a single-precision, floating-point value into the low word
157 /// and clears the upper three words.
158 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
// Body of AKSIMD_LOAD_SS_V4F32(const AKSIMD_F32 *in_pData). The signature (listing line 159)
// is missing from this extract; the name comes from the symbol index at the end of the file.
// Reads exactly one float through in_pData into lane 0; lanes 1..3 are set to 0.f.
// in_pData is dereferenced without a null check.
160 {
161  AKSIMD_V4F32 vector;
162  vector.m_data[0] = *in_pData;
163  vector.m_data[1] = 0.f;
164  vector.m_data[2] = 0.f;
165  vector.m_data[3] = 0.f;
166
167  return vector;
168 }
169 
170 //@}
171 ////////////////////////////////////////////////////////////////////////
172 
173 ////////////////////////////////////////////////////////////////////////
174 /// @name AKSIMD storing
175 //@{
176 
// Store macros. Each expands to a bare assignment through a cast pointer; the expansion is
// not wrapped in parentheses, so use it as a statement rather than inside a larger expression.
177 // _mm_storeu_ps -- The address does not need to be 16-byte aligned.
178 #define AKSIMD_STOREU_V4F32( in_pTo, in_vec ) (*(AKSIMD_V4F32*)(in_pTo)) = (in_vec)
179
180 // _mm_store_ps -- The address must be 16-byte aligned.
181 // This generic implementation makes no distinction between the aligned and unaligned
// store: AKSIMD_STORE_V4F32 simply forwards to AKSIMD_STOREU_V4F32.
182 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) AKSIMD_STOREU_V4F32(__addr__, __vName__)
183
184 // _mm_storeu_si128
185 #define AKSIMD_STOREU_V4I32( in_pTo, in_vec ) (*(AKSIMD_V4I32*)(in_pTo)) = (in_vec)
186 
187 /// Stores the lower single-precision, floating-point value.
188 /// *p := a0 (see _mm_store_ss)
// Body of AKSIMD_STORE1_V4F32(AKSIMD_F32 *in_pTo, const AKSIMD_V4F32 &in_vec). The signature
// (listing line 189) is missing from this extract; the name comes from the symbol index.
// Writes only lane 0 of in_vec: in_pTo is viewed as an AKSIMD_V4F32* and its m_data[0]
// (the first float at in_pTo) is assigned; nothing past that float is written.
190 {
191  ((AKSIMD_V4F32*)in_pTo)->m_data[0] = in_vec.m_data[0];
192 }
193 
194 //@}
195 ////////////////////////////////////////////////////////////////////////
196 
197 ////////////////////////////////////////////////////////////////////////
198 /// @name AKSIMD conversion
199 //@{
200 
201 // _mm_cvtepi32_ps
// Body of AKSIMD_CONVERT_V4I32_TO_V4F32(const AKSIMD_V4I32 &in_from). The signature (listing
// line 202) is missing from this extract; the name comes from the symbol index.
// Converts each 32-bit integer lane to float with an (AkReal32) cast.
203 {
204  AKSIMD_V4F32 vector;
205  vector.m_data[0] = (AkReal32)in_from.m_data[0];
206  vector.m_data[1] = (AkReal32)in_from.m_data[1];
207  vector.m_data[2] = (AkReal32)in_from.m_data[2];
208  vector.m_data[3] = (AkReal32)in_from.m_data[3];
209
210  return vector;
211 }
212 // _mm_cvtps_epi32
// Body of AKSIMD_CONVERT_V4F32_TO_V4I32(const AKSIMD_V4F32 &in_from). The signature (listing
// line 213) is missing from this extract; the name comes from the symbol index.
// Converts each float lane to AkInt32 with a C cast, which truncates toward zero.
// NOTE(review): the comment preceding this function cites _mm_cvtps_epi32, which rounds
// according to the current rounding mode rather than truncating -- results may differ from
// an SSE build; confirm which behavior callers expect. Values outside the AkInt32 range
// are not handled here.
214 {
215  AKSIMD_V4I32 vector;
216  vector.m_data[0] = (AkInt32)in_from.m_data[0];
217  vector.m_data[1] = (AkInt32)in_from.m_data[1];
218  vector.m_data[2] = (AkInt32)in_from.m_data[2];
219  vector.m_data[3] = (AkInt32)in_from.m_data[3];
220
221  return vector;
222 }
223 
224 //@}
225 ////////////////////////////////////////////////////////////////////////
226 
227 ////////////////////////////////////////////////////////////////////////
228 /// @name AKSIMD logical operations
229 //@{
230 
231 // _mm_and_si128
// Body of AKSIMD_AND_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2). The
// signature (listing line 232) is missing from this extract; the name comes from the symbol index.
// Returns the lane-wise bitwise AND of the two integer vectors.
233 {
234  AKSIMD_V4I32 vector;
235  vector.m_data[0] = in_vec1.m_data[0] & in_vec2.m_data[0];
236  vector.m_data[1] = in_vec1.m_data[1] & in_vec2.m_data[1];
237  vector.m_data[2] = in_vec1.m_data[2] & in_vec2.m_data[2];
238  vector.m_data[3] = in_vec1.m_data[3] & in_vec2.m_data[3];
239
240  return vector;
241 }
242 
243 /// Compares the 8 signed 16-bit integers in a and the 8 signed
244 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
// Body of AKSIMD_CMPGT_V8I16(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2). The
// signature (listing line 245) is missing from this extract; the name comes from the symbol index.
// Both inputs and the result are reinterpreted as arrays of eight AkInt16 through pointer
// casts. Each 16-bit result element is set to 0xffff where the in_vec1 element is greater
// than the in_vec2 element, and to 0 otherwise.
246 {
247  AKSIMD_V4I32 vector;
248
249  AkInt16 *pVec1,*pVec2,*pVec3;
250  pVec1 = (AkInt16*)&in_vec1;
251  pVec2 = (AkInt16*)&in_vec2;
252  pVec3 = (AkInt16*)&vector;
253
254  pVec3[0] = (pVec1[0] > pVec2[0]) ? 0xffff : 0x0;
255  pVec3[1] = (pVec1[1] > pVec2[1]) ? 0xffff : 0x0;
256  pVec3[2] = (pVec1[2] > pVec2[2]) ? 0xffff : 0x0;
257  pVec3[3] = (pVec1[3] > pVec2[3]) ? 0xffff : 0x0;
258  pVec3[4] = (pVec1[4] > pVec2[4]) ? 0xffff : 0x0;
259  pVec3[5] = (pVec1[5] > pVec2[5]) ? 0xffff : 0x0;
260  pVec3[6] = (pVec1[6] > pVec2[6]) ? 0xffff : 0x0;
261  pVec3[7] = (pVec1[7] > pVec2[7]) ? 0xffff : 0x0;
262
263  return vector;
264 }
265 
266 /// Compares for less than or equal (see _mm_cmple_ps)
// Body of AKSIMD_CMPLE_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 267) is missing from this extract; the name comes from the symbol index.
// Returns an AKSIMD_V4UI32 whose lanes are 0xffffffff where in_vec1 <= in_vec2 and 0 otherwise.
// The result is an integer mask vector, not a float vector.
268 {
269  AKSIMD_V4UI32 vector;
270
271  vector.m_data[0] = (in_vec1.m_data[0] <= in_vec2.m_data[0]) ? 0xffffffff : 0x0;
272  vector.m_data[1] = (in_vec1.m_data[1] <= in_vec2.m_data[1]) ? 0xffffffff : 0x0;
273  vector.m_data[2] = (in_vec1.m_data[2] <= in_vec2.m_data[2]) ? 0xffffffff : 0x0;
274  vector.m_data[3] = (in_vec1.m_data[3] <= in_vec2.m_data[3]) ? 0xffffffff : 0x0;
275
276  return vector;
277 }
278 
// Body of AKSIMD_GTEQ_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 279) is missing from this extract; the name comes from the symbol index.
// Per lane: tests in_vec1 >= in_vec2 and stores the outcome as a float.
// NOTE(review): (AkReal32)0xffffffff is a numeric conversion, so a "true" lane holds the
// float value 4294967296.0f rather than an all-ones bit pattern, and a "false" lane holds 0.0f.
// This differs from a bit-mask comparison result -- confirm what callers rely on before changing.
280 {
281  AKSIMD_V4F32 vector;
282
283  vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] >= in_vec2.m_data[0]) ? 0xffffffff : 0x0);
284  vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] >= in_vec2.m_data[1]) ? 0xffffffff : 0x0);
285  vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] >= in_vec2.m_data[2]) ? 0xffffffff : 0x0);
286  vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] >= in_vec2.m_data[3]) ? 0xffffffff : 0x0);
287
288  return vector;
289 }
290 
// Body of AKSIMD_GT_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 291) is missing from this extract; the name comes from the symbol index.
// Per lane: tests in_vec1 > in_vec2 and stores the outcome as a float.
// NOTE(review): (AkReal32)0xffffffff is a numeric conversion, so a "true" lane holds the
// float value 4294967296.0f rather than an all-ones bit pattern, and a "false" lane holds 0.0f.
// Confirm what callers rely on before changing.
292 {
293  AKSIMD_V4F32 vector;
294
295  vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] > in_vec2.m_data[0]) ? 0xffffffff : 0x0);
296  vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] > in_vec2.m_data[1]) ? 0xffffffff : 0x0);
297  vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] > in_vec2.m_data[2]) ? 0xffffffff : 0x0);
298  vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] > in_vec2.m_data[3]) ? 0xffffffff : 0x0);
299
300  return vector;
301 }
302 
// Body of AKSIMD_LTEQ_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 303) is missing from this extract; the name comes from the symbol index.
// Per lane: tests in_vec1 <= in_vec2 and stores the outcome as a float.
// NOTE(review): (AkReal32)0xffffffff is a numeric conversion, so a "true" lane holds the
// float value 4294967296.0f rather than an all-ones bit pattern, and a "false" lane holds 0.0f.
// Confirm what callers rely on before changing.
304 {
305  AKSIMD_V4F32 vector;
306
307  vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] <= in_vec2.m_data[0]) ? 0xffffffff : 0x0);
308  vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] <= in_vec2.m_data[1]) ? 0xffffffff : 0x0);
309  vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] <= in_vec2.m_data[2]) ? 0xffffffff : 0x0);
310  vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] <= in_vec2.m_data[3]) ? 0xffffffff : 0x0);
311
312  return vector;
313 }
314 
// Body of AKSIMD_LT_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 315) is missing from this extract; the name comes from the symbol index.
// Per lane: tests in_vec1 < in_vec2 and stores the outcome as a float.
// NOTE(review): (AkReal32)0xffffffff is a numeric conversion, so a "true" lane holds the
// float value 4294967296.0f rather than an all-ones bit pattern, and a "false" lane holds 0.0f.
// Confirm what callers rely on before changing.
316 {
317  AKSIMD_V4F32 vector;
318
319  vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] < in_vec2.m_data[0]) ? 0xffffffff : 0x0);
320  vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] < in_vec2.m_data[1]) ? 0xffffffff : 0x0);
321  vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] < in_vec2.m_data[2]) ? 0xffffffff : 0x0);
322  vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] < in_vec2.m_data[3]) ? 0xffffffff : 0x0);
323
324  return vector;
325 }
326 
// Body of AKSIMD_EQ_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 327) is missing from this extract; the name comes from the symbol index.
// Per lane: tests in_vec1 == in_vec2 (exact float equality) and stores the outcome as a float.
// NOTE(review): (AkReal32)0xffffffff is a numeric conversion, so a "true" lane holds the
// float value 4294967296.0f rather than an all-ones bit pattern, and a "false" lane holds 0.0f.
// Confirm what callers rely on before changing.
328 {
329  AKSIMD_V4F32 vector;
330
331  vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] == in_vec2.m_data[0]) ? 0xffffffff : 0x0);
332  vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] == in_vec2.m_data[1]) ? 0xffffffff : 0x0);
333  vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] == in_vec2.m_data[2]) ? 0xffffffff : 0x0);
334  vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] == in_vec2.m_data[3]) ? 0xffffffff : 0x0);
335
336  return vector;
337 }
338 
// Body of AKSIMD_XOR_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 339) is missing from this extract; the name comes from the symbol index.
// Per lane: converts both floats to AkUInt32 by value, XORs the integers, and converts the
// integer result back to float.
// NOTE(review): this is not a bitwise XOR of the float representations -- the casts discard
// the fractional part, and converting a negative or out-of-range float to an unsigned
// integer is undefined behavior in C/C++. Confirm the intended semantics with callers.
340 {
341  AKSIMD_V4F32 vector;
342
343  vector.m_data[0] = (AkReal32)(((AkUInt32)in_vec1.m_data[0]) ^ ((AkUInt32)in_vec2.m_data[0]));
344  vector.m_data[1] = (AkReal32)(((AkUInt32)in_vec1.m_data[1]) ^ ((AkUInt32)in_vec2.m_data[1]));
345  vector.m_data[2] = (AkReal32)(((AkUInt32)in_vec1.m_data[2]) ^ ((AkUInt32)in_vec2.m_data[2]));
346  vector.m_data[3] = (AkReal32)(((AkUInt32)in_vec1.m_data[3]) ^ ((AkUInt32)in_vec2.m_data[3]));
347
348  return vector;
349 }
350 
// Body of AKSIMD_SHIFTLEFT_V4I32(AKSIMD_V4I32 in_vector, int in_shiftBy). The signature
// (listing line 351) is missing from this extract; the name comes from the symbol index.
// in_vector is received by value, each lane is shifted left by in_shiftBy in place, and the
// modified copy is returned.
// NOTE(review): the lanes are signed AkInt32; shifting by a negative count or by 32 or more
// is undefined, and left-shifting negative values is undefined in C -- confirm caller ranges.
352 {
353  in_vector.m_data[0] <<= in_shiftBy;
354  in_vector.m_data[1] <<= in_shiftBy;
355  in_vector.m_data[2] <<= in_shiftBy;
356  in_vector.m_data[3] <<= in_shiftBy;
357
358  return in_vector;
359 }
360 
// Body of AKSIMD_SHIFTRIGHTARITH_V4I32(AKSIMD_V4I32 in_vector, int in_shiftBy). The signature
// (listing line 361) is missing from this extract; the name comes from the symbol index.
// in_vector is received by value, each signed lane is shifted right by in_shiftBy in place,
// and the modified copy is returned.
// NOTE(review): >> on a negative signed value is implementation-defined in C; the function
// name suggests a sign-extending (arithmetic) shift is expected -- confirm on target compilers.
362 {
363  in_vector.m_data[0] >>= in_shiftBy;
364  in_vector.m_data[1] >>= in_shiftBy;
365  in_vector.m_data[2] >>= in_shiftBy;
366  in_vector.m_data[3] >>= in_shiftBy;
367
368  return in_vector;
369 }
370 
371 //@}
372 ////////////////////////////////////////////////////////////////////////
373 
374 
375 ////////////////////////////////////////////////////////////////////////
376 /// @name AKSIMD arithmetic
377 //@{
378 // _mm_sub_ps
// Body of AKSIMD_SUB_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 379) is missing from this extract; the name comes from the symbol index.
// Returns the lane-wise difference in_vec1 - in_vec2.
380 {
381  AKSIMD_V4F32 vector;
382
383  vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
384  vector.m_data[1] = in_vec1.m_data[1] - in_vec2.m_data[1];
385  vector.m_data[2] = in_vec1.m_data[2] - in_vec2.m_data[2];
386  vector.m_data[3] = in_vec1.m_data[3] - in_vec2.m_data[3];
387
388  return vector;
389 }
390 
391 /// Subtracts the lower single-precision, floating-point values of a and b.
392 /// The upper three single-precision, floating-point values are passed through from a.
393 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
394 
// Body of AKSIMD_SUB_SS_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 395) is missing from this extract; the name comes from the symbol index.
// Lane 0 is in_vec1[0] - in_vec2[0]; lanes 1..3 are copied unchanged from in_vec1.
396 {
397  AKSIMD_V4F32 vector;
398
399  vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
400  vector.m_data[1] = in_vec1.m_data[1];
401  vector.m_data[2] = in_vec1.m_data[2];
402  vector.m_data[3] = in_vec1.m_data[3];
403
404  return vector;
405 }
406 
407 // _mm_add_ps
// Body of AKSIMD_ADD_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 408) is missing from this extract; the name comes from the symbol index.
// Returns the lane-wise sum in_vec1 + in_vec2.
409 {
410  AKSIMD_V4F32 vector;
411
412  vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
413  vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];
414  vector.m_data[2] = in_vec1.m_data[2] + in_vec2.m_data[2];
415  vector.m_data[3] = in_vec1.m_data[3] + in_vec2.m_data[3];
416
417  return vector;
418 }
419 
// Function body whose signature (listing line 420) is missing from this extract.
// Presumably AKSIMD_DIV_V4F32 -- the symbol index lists that name, but its entry points at a
// different line and uses other parameter names, so the exact signature is unconfirmed.
// Returns the lane-wise quotient in_vec1 / in_vec2 using ordinary float division; divisors
// are not checked for zero.
421 {
422  AKSIMD_V4F32 vector;
423
424  vector.m_data[0] = in_vec1.m_data[0] / in_vec2.m_data[0];
425  vector.m_data[1] = in_vec1.m_data[1] / in_vec2.m_data[1];
426  vector.m_data[2] = in_vec1.m_data[2] / in_vec2.m_data[2];
427  vector.m_data[3] = in_vec1.m_data[3] / in_vec2.m_data[3];
428
429  return vector;
430 }
431 
// Body of AKSIMD_ADD_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2). The
// signature (listing line 432) is missing from this extract; the name comes from the symbol index.
// Returns the lane-wise sum of the two 2-lane vectors.
433 {
434  AKSIMD_V2F32 vector;
435
436  vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
437  vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];
438
439  return vector;
440 }
441 
442 /// Adds the lower single-precision, floating-point values of a and b; the
443 /// upper three single-precision, floating-point values are passed through from a.
444 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
// Body of AKSIMD_ADD_SS_V4F32(const AKSIMD_V4F32 &a, const AKSIMD_V4F32 &b). The signature
// (listing line 445) is missing from this extract; the name comes from the symbol index.
// Lane 0 is a[0] + b[0]; lanes 1..3 are copied unchanged from a.
446 {
447  AKSIMD_V4F32 vector;
448
449  vector.m_data[0] = a.m_data[0] + b.m_data[0];
450  vector.m_data[1] = a.m_data[1];
451  vector.m_data[2] = a.m_data[2];
452  vector.m_data[3] = a.m_data[3];
453
454  return vector;
455 }
456 
457 // _mm_mul_ps
// Body of AKSIMD_MUL_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 458) is missing from this extract; the name comes from the symbol index.
// Returns the lane-wise product in_vec1 * in_vec2.
459 {
460  AKSIMD_V4F32 vector;
461
462  vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
463  vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];
464  vector.m_data[2] = in_vec1.m_data[2] * in_vec2.m_data[2];
465  vector.m_data[3] = in_vec1.m_data[3] * in_vec2.m_data[3];
466
467  return vector;
468 }
469 
// Body of AKSIMD_MUL_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2). The
// signature (listing line 470) is missing from this extract; the name comes from the symbol index.
// Returns the lane-wise product of the two 2-lane vectors.
471 {
472  AKSIMD_V2F32 vector;
473
474  vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
475  vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];
476
477  return vector;
478 }
479 
480 /// Multiplies the lower single-precision, floating-point values of
481 /// a and b; the upper three single-precision, floating-point values
482 /// are passed through from a.
483 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
// Body of AKSIMD_MUL_SS_V4F32(const AKSIMD_V4F32 &a, const AKSIMD_V4F32 &b). The signature
// (listing line 484) is missing from this extract; the name comes from the symbol index.
485 {
486  AKSIMD_V4F32 vector;
487
488  vector.m_data[0] = a.m_data[0] * b.m_data[0];
489  vector.m_data[1] = a.m_data[1];
490  vector.m_data[2] = a.m_data[2];
491  vector.m_data[3] = a.m_data[3];
492
493  return vector;
494 }
495 
496 /// Vector multiply-add operation: (__a__ * __b__) + __c__ per lane.
/// AKSIMD_MSUB_V4F32 is the matching multiply-subtract: (__a__ * __b__) - __c__ per lane.
/// Both are composed from the separate MUL and ADD/SUB functions (two operations, not fused).
497 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
498 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
499
500 /// Scalar (lane 0 only) multiply-add: lane 0 is (__a__[0] * __b__[0]) + __c__[0];
/// lanes 1..3 pass through from __a__, following AKSIMD_MUL_SS_V4F32 and AKSIMD_ADD_SS_V4F32.
501 #define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( (__a__), (__b__) ), (__c__) )
502 
503 // _mm_min_ps
// Body of AKSIMD_MIN_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 504) is missing from this extract; the name comes from the symbol index.
// Applies AkMin to each pair of lanes. AkMin is a macro defined outside this file
// (AkPlatformFuncs.h per the symbol index); its NaN behavior is not visible here.
505 {
506  AKSIMD_V4F32 vector;
507
508  vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
509  vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);
510  vector.m_data[2] = AkMin(in_vec1.m_data[2], in_vec2.m_data[2]);
511  vector.m_data[3] = AkMin(in_vec1.m_data[3], in_vec2.m_data[3]);
512
513  return vector;
514 }
515 
// Body of AKSIMD_MIN_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2). The
// signature (listing line 516) is missing from this extract; the name comes from the symbol index.
// Applies AkMin (macro defined outside this file) to each pair of lanes.
517 {
518  AKSIMD_V2F32 vector;
519
520  vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
521  vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);
522
523  return vector;
524 }
525 
526 // _mm_max_ps
// Body of AKSIMD_MAX_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2). The
// signature (listing line 527) is missing from this extract; the name comes from the symbol index.
// Applies AkMax to each pair of lanes. AkMax is a macro defined outside this file
// (AkPlatformFuncs.h per the symbol index); its NaN behavior is not visible here.
528 {
529  AKSIMD_V4F32 vector;
530
531  vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
532  vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);
533  vector.m_data[2] = AkMax(in_vec1.m_data[2], in_vec2.m_data[2]);
534  vector.m_data[3] = AkMax(in_vec1.m_data[3], in_vec2.m_data[3]);
535
536  return vector;
537 }
538 
// Body of AKSIMD_MAX_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2). The
// signature (listing line 539) is missing from this extract; the name comes from the symbol index.
// Applies AkMax (macro defined outside this file) to each pair of lanes.
540 {
541  AKSIMD_V2F32 vector;
542
543  vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
544  vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);
545
546  return vector;
547 }
548 
// Body of AKSIMD_ABS_V4F32(const AKSIMD_V4F32 &in_vec1). The signature (listing line 549) is
// missing from this extract; the name comes from the symbol index.
// Returns the lane-wise absolute value, computed with fabsf.
550 {
551  AKSIMD_V4F32 vector;
552  vector.m_data[0] = fabsf(in_vec1.m_data[0]);
553  vector.m_data[1] = fabsf(in_vec1.m_data[1]);
554  vector.m_data[2] = fabsf(in_vec1.m_data[2]);
555  vector.m_data[3] = fabsf(in_vec1.m_data[3]);
556  return vector;
557 }
558 
// Body of AKSIMD_NEG_V4F32(const AKSIMD_V4F32 &in_vec1). The signature (listing line 559) is
// missing from this extract; the name comes from the symbol index.
// Returns the lane-wise negation of in_vec1.
560 {
561  AKSIMD_V4F32 vector;
562  vector.m_data[0] = -in_vec1.m_data[0];
563  vector.m_data[1] = -in_vec1.m_data[1];
564  vector.m_data[2] = -in_vec1.m_data[2];
565  vector.m_data[3] = -in_vec1.m_data[3];
566  return vector;
567 }
568 
569 // _mm_sqrt_ps
// Body of AKSIMD_SQRT_V4F32(const AKSIMD_V4F32 &in_vec). The signature (listing line 570) is
// missing from this extract; the name comes from the symbol index.
// Returns the lane-wise square root computed with sqrtf. The local is named vCompare but
// simply holds the result; negative inputs are passed to sqrtf unchecked.
571 {
572  AKSIMD_V4F32 vCompare;
573  AKSIMD_GETELEMENT_V4F32(vCompare,0) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,0) );
574  AKSIMD_GETELEMENT_V4F32(vCompare,1) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,1) );
575  AKSIMD_GETELEMENT_V4F32(vCompare,2) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,2) );
576  AKSIMD_GETELEMENT_V4F32(vCompare,3) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,3) );
577
578  //AKSIMD_V4F32 res = vrecpeq_f32( vrsqrteq_f32( in_vec ) );
579
580  return vCompare /*res*/;
581 }
582 
583 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
// Body of AKSIMD_RSQRT_V4F32(const AKSIMD_V4F32 &in_vec). The signature (listing line 584) is
// missing from this extract; the name comes from the symbol index.
// Computes 1.f / sqrtf(x) for each lane. Although the description calls this an
// "approximation", this generic version uses a regular division and sqrtf. Zero or negative
// lanes are not special-cased.
585 {
586  AKSIMD_V4F32 vCompare;
587  AKSIMD_GETELEMENT_V4F32(vCompare, 0) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 0));
588  AKSIMD_GETELEMENT_V4F32(vCompare, 1) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 1));
589  AKSIMD_GETELEMENT_V4F32(vCompare, 2) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 2));
590  AKSIMD_GETELEMENT_V4F32(vCompare, 3) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 3));
591
592  return vCompare;
593 }
594 
// Body of AKSIMD_SQRT_V2F32(const AKSIMD_V2F32 &in_vec). The signature (listing line 595) is
// missing from this extract; the name comes from the symbol index.
// Returns the square root of both lanes, computed with sqrtf.
// NOTE(review): the lanes of these AKSIMD_V2F32 values are accessed with
// AKSIMD_GETELEMENT_V4F32. With the accessor definitions in this file both macros expand to
// .m_data[index], so the result is the same, but AKSIMD_GETELEMENT_V2F32 is the matching
// accessor.
596 {
597  AKSIMD_V2F32 vCompare;
598  AKSIMD_GETELEMENT_V4F32(vCompare,0) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,0) );
599  AKSIMD_GETELEMENT_V4F32(vCompare,1) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,1) );
600
601  //AKSIMD_V4F32 res = vrecpeq_f32( vrsqrteq_f32( in_vec ) );
602
603  return vCompare /*res*/;
604 }
605 
606 //@}
607 ////////////////////////////////////////////////////////////////////////
608 
609 
610 ////////////////////////////////////////////////////////////////////////
611 /// @name AKSIMD packing / unpacking
612 //@{
613 
614 //
615 // _mm_unpacklo_epi16
616 // r0 := a0
617 // r1 := b0
618 // r2 := a1
619 // r3 := b1
620 // r4 := a2
621 // r5 := b2
622 // r6 := a3
623 // r7 := b3
// Body of AKSIMD_UNPACKLO_VECTOR8I16(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2).
// The signature (listing line 624) is missing from this extract; the name comes from the
// symbol index.
// Views both inputs and the result as eight AkInt16 elements (pointer casts) and interleaves
// elements 0..3 of in_vec1 with elements 0..3 of in_vec2: a0 b0 a1 b1 a2 b2 a3 b3.
625 {
626  AKSIMD_V4I32 vector;
627  AkInt16 *pVec1,*pVec2,*pDest;
628  pVec1 = (AkInt16*)&in_vec1;
629  pVec2 = (AkInt16*)&in_vec2;
630  pDest = (AkInt16*)&vector;
631
632  pDest[0] = pVec1[0];
633  pDest[1] = pVec2[0];
634  pDest[2] = pVec1[1];
635  pDest[3] = pVec2[1];
636  pDest[4] = pVec1[2];
637  pDest[5] = pVec2[2];
638  pDest[6] = pVec1[3];
639  pDest[7] = pVec2[3];
640
641  return vector;
642 }
643 
644 // _mm_unpackhi_epi16
// Body of AKSIMD_UNPACKHI_VECTOR8I16(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2).
// The signature (listing line 645) is missing from this extract; the name comes from the
// symbol index.
// Views both inputs and the result as eight AkInt16 elements (pointer casts) and interleaves
// elements 4..7 of in_vec1 with elements 4..7 of in_vec2: a4 b4 a5 b5 a6 b6 a7 b7.
646 {
647  AKSIMD_V4I32 vector;
648  AkInt16 *pVec1,*pVec2,*pDest;
649  pVec1 = (AkInt16*)&in_vec1;
650  pVec2 = (AkInt16*)&in_vec2;
651  pDest = (AkInt16*)&vector;
652
653  pDest[0] = pVec1[4];
654  pDest[1] = pVec2[4];
655  pDest[2] = pVec1[5];
656  pDest[3] = pVec2[5];
657  pDest[4] = pVec1[6];
658  pDest[5] = pVec2[6];
659  pDest[6] = pVec1[7];
660  pDest[7] = pVec2[7];
661
662  return vector;
663 }
664 
665 // _mm_unpacklo_ps
// Function body whose signature (listing line 666) is missing from this extract.
// Presumably AKSIMD_UNPACKLO_V4F32 -- the symbol index lists that name with parameters
// (in_vec1, in_vec2) but points at a different line, so the exact signature is unconfirmed.
// Interleaves the two low lanes of the inputs: in_vec1[0], in_vec2[0], in_vec1[1], in_vec2[1].
667 {
668  AKSIMD_V4F32 vector;
669  vector.m_data[0] = in_vec1.m_data[0];
670  vector.m_data[1] = in_vec2.m_data[0];
671  vector.m_data[2] = in_vec1.m_data[1];
672  vector.m_data[3] = in_vec2.m_data[1];
673
674  return vector;
675 }
676 
677 // _mm_unpackhi_ps
// Function body whose signature (listing line 678) is missing from this extract.
// Presumably AKSIMD_UNPACKHI_V4F32 -- the symbol index lists that name with parameters
// (in_vec1, in_vec2) but points at a different line, so the exact signature is unconfirmed.
// Interleaves the two high lanes of the inputs: in_vec1[2], in_vec2[2], in_vec1[3], in_vec2[3].
679 {
680  AKSIMD_V4F32 vector;
681  vector.m_data[0] = in_vec1.m_data[2];
682  vector.m_data[1] = in_vec2.m_data[2];
683  vector.m_data[2] = in_vec1.m_data[3];
684  vector.m_data[3] = in_vec2.m_data[3];
685
686  return vector;
687 }
688 
689 // _mm_packs_epi32
// Function body whose signature (listing line 690) is missing from this extract.
// Presumably AKSIMD_PACKS_V4I32 -- the symbol index lists that name with parameters
// (in_vec1, in_vec2) but points at a different line, so the exact signature is unconfirmed.
// Narrows the four 32-bit lanes of in_vec1 and then the four of in_vec2 into eight AkInt16
// elements of the result (written through a pointer cast).
// NOTE(review): each lane is cast to AkInt16 BEFORE it is passed to AkClamp, so the value
// has already been narrowed to 16 bits and a clamp to [-32768, 32767] cannot change it;
// out-of-range inputs therefore wrap instead of saturating. _mm_packs_epi32 (cited in the
// comment preceding this function) saturates, which would require clamping the AkInt32
// first and casting afterwards. AkClamp is defined outside this file -- confirm and fix.
691 {
692  AKSIMD_V4I32 vector;
693  AkInt16 *pDest = (AkInt16*)&vector;
694
695  pDest[0] = (AkInt16)AkClamp((AkInt16)in_vec1.m_data[0], -32768, 32767);
696  pDest[1] = (AkInt16)AkClamp((AkInt16)in_vec1.m_data[1], -32768, 32767);
697  pDest[2] = (AkInt16)AkClamp((AkInt16)in_vec1.m_data[2], -32768, 32767);
698  pDest[3] = (AkInt16)AkClamp((AkInt16)in_vec1.m_data[3], -32768, 32767);
699  pDest[4] = (AkInt16)AkClamp((AkInt16)in_vec2.m_data[0], -32768, 32767);
700  pDest[5] = (AkInt16)AkClamp((AkInt16)in_vec2.m_data[1], -32768, 32767);
701  pDest[6] = (AkInt16)AkClamp((AkInt16)in_vec2.m_data[2], -32768, 32767);
702  pDest[7] = (AkInt16)AkClamp((AkInt16)in_vec2.m_data[3], -32768, 32767);
703
704  return vector;
705 }
706 
707 //@}
708 ////////////////////////////////////////////////////////////////////////
709 
710 
711 //#define AKSIMD_GET_ITEM( vec, index ) vec[index]
712 
713 
714 
715 
716 ////////////////////////////////////////////////////////////////////////
717 /// @name AKSIMD shuffling
718 //@{
719 
720 // See _MM_SHUFFLE
// Packs four 2-bit lane indices into one mask: fp0 occupies bits 0-1, fp1 bits 2-3,
// fp2 bits 4-5 and fp3 bits 6-7. The arguments are not range-checked or masked.
721 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
722  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
723 
724 // See _mm_shuffle_ps
725 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
726 //#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw )
727 
// Body of AKSIMD_SHUFFLE_V4F32(const AKSIMD_V4F32 &xyzw, const AKSIMD_V4F32 &abcd, int mask).
// The signature (listing line 728) is missing from this extract; the name comes from the
// symbol index.
// Result lanes 0 and 1 are selected from xyzw, lanes 2 and 3 from abcd. Each consecutive
// 2-bit field of mask (starting at bit 0) is the source lane index for the corresponding
// result lane; the "& 0x3" keeps every index within 0..3.
729 {
730  AKSIMD_V4F32 vector;
731  vector.m_data[0] = xyzw.m_data[(mask) & 0x3];
732  vector.m_data[1] = xyzw.m_data[(mask >> 2) & 0x3];
733  vector.m_data[2] = abcd.m_data[(mask >> 4) & 0x3];
734  vector.m_data[3] = abcd.m_data[(mask >> 6) & 0x3];
735
736  return vector;
737 }
738 
739 
740 /// Moves the upper two single-precision, floating-point values of b to
741 /// the lower two single-precision, floating-point values of the result.
742 /// The upper two single-precision, floating-point values of a are passed
743 /// through to the result.
744 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
/// Note the swapped operand order in the expansion: AKSIMD_SHUFFLE_V4F32 takes its two low
/// result lanes from its first operand, so b is passed first and a second.
745 #define AKSIMD_MOVEHL_V4F32( a, b ) \
746  AKSIMD_SHUFFLE_V4F32( (b), (a), AKSIMD_SHUFFLE(3, 2, 3, 2) )
747 
748 /// Moves the lower two single-precision, floating-point values of b to
749 /// the upper two single-precision, floating-point values of the result.
750 /// The lower two single-precision, floating-point values of a are passed
751 /// through to the result.
752 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
/// Here a is the first shuffle operand (source of result lanes 0-1) and b the second
/// (source of result lanes 2-3).
753 #define AKSIMD_MOVELH_V4F32( a, b ) \
754  AKSIMD_SHUFFLE_V4F32( (a), (b), AKSIMD_SHUFFLE(1, 0, 1, 0) )
755 
756 /// Swap the 2 lower floats together and the 2 higher floats together.
/// Result lanes: r0 := a1; r1 := a0; r2 := a3; r3 := a2.
/// The expansion is a plain expression with no trailing semicolon, so the macro can be
/// nested inside other expressions; __a__ is evaluated twice.
757 #define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
758 
759 /// Swap the 2 lower floats with the 2 higher floats.
/// Result lanes: r0 := a2; r1 := a3; r2 := a0; r3 := a1.
/// The expansion is a plain expression with no trailing semicolon, so the macro can be
/// nested inside other expressions; __a__ is evaluated twice.
760 #define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
761 
762 /// Barrel-shift all floats by one.
/// Result lanes: r0 := a1; r1 := a2; r2 := a3; r3 := a0. __a__ is evaluated twice.
763 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
764 
765  /// Duplicates the odd items into the even items (d c b a -> d d b b )
 /// Result lanes: r0 := v1; r1 := v1; r2 := v3; r3 := v3.
 /// __vv is pasted into the expansion without parentheses and is evaluated twice.
766 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
767 
768  /// Duplicates the even items into the odd items (d c b a -> c c a a )
 /// Result lanes: r0 := v0; r1 := v0; r2 := v2; r3 := v2.
 /// __vv is pasted into the expansion without parentheses and is evaluated twice.
769 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
770 
771 
772 //#include <AK/SoundEngine/Platforms/Generic/AkSimdShuffle.h>
773 
774 //@}
775 ////////////////////////////////////////////////////////////////////////
776 
777 // Old AKSIMD -- will search-and-replace later
// Legacy aliases that forward to the current AKSIMD_* names.
778 #define AkReal32Vector AKSIMD_V4F32
// AKSIMD_LOAD1_V4F32 in this generic implementation takes the scalar by value
// (AKSIMD_F32 in_value), so the scalar itself is forwarded rather than its address.
779 #define AKSIMD_LOAD1( __scalar__ ) AKSIMD_LOAD1_V4F32( (__scalar__) )
// The cast lets callers pass any pointer type; the load macro then reinterprets it as AKSIMD_V4F32*.
780 #define AKSIMD_LOADVEC(v) AKSIMD_LOAD_V4F32((const AKSIMD_F32*)((v)))
781 #define AKSIMD_MUL AKSIMD_MUL_V4F32
782 #define AKSIMD_STOREVEC AKSIMD_STORE_V4F32
783 
784 /// Faked in-place vector horizontal add.
785 /// \akwarning
786 /// Don't expect this to be very efficient.
787 /// \endakwarning
// Body of AKSIMD_HORIZONTALADD(AKSIMD_V4F32 &vVec). The signature (listing line 788) is
// missing from this extract; the name comes from the symbol index.
// Modifies vVec in place. After the call lane 0 holds v0 + v1 + v2 + v3 of the original
// vector; lanes 1..3 hold intermediate sums and should not be relied on.
789 {
// vHighLow = (v2, v3, v2, v3), so lanes 0 and 1 become v0+v2 and v1+v3.
790  AKSIMD_V4F32 vHighLow = AKSIMD_MOVEHL_V4F32(vVec, vVec);
791  vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
// Mask 0x55 selects lane 1 for every result lane: broadcast (v1+v3) and add it in.
792  vHighLow = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0x55);
793  vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
794 }
795 
796 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
// Body of AKSIMD_COMPLEXMUL(const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2). The
// signature (listing line 797) is missing from this extract; the name comes from the symbol index.
// Each input holds two complex numbers as (re, im, re, im). For every pair the result is
// (re1*re2 - im1*im2, re1*im2 + im1*re2).
798 {
799  static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f };
800
// vTmp1 = (re1*re2, re1*im2) for each pair: real parts of vCIn1 duplicated, times vCIn2.
801  AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
802  vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
// vTmp2 = (im1*re2, -im1*im2) for each pair: imaginary parts duplicated, sign-flipped on
// odd lanes, times vCIn2 ...
803  AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
804  vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
805  vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
// ... then swapped within each pair to (-im1*im2, im1*re2) so it lines up with vTmp1.
806  vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 );
807  vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
808  return vTmp2;
809 }
810 
// Broadcasts lane idx of var into all four lanes of the result. var and idx are pasted
// into the expansion without parentheses; var is evaluated twice and idx four times.
811 #define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
812 
// AK_SIGN_BIT converts val to AkUInt32 with a C-style value cast and keeps bit 31 of the
// converted integer.
// NOTE(review): for a float argument this is a numeric conversion, not a reinterpretation
// of the float's bits, so it does not read the IEEE sign bit (for example -0.5f converts
// to 0), and converting a negative or out-of-range float to an unsigned integer is
// undefined behavior in C/C++. The comparison functions in this file return numeric float
// values rather than bit masks, so confirm what callers rely on before changing either
// side. val is also not parenthesized inside the expansion.
813 #define AK_SIGN_BIT( val ) (((AkUInt32)val) >> 31)
814
// Builds a 4-bit integer from in_vec: bit i is AK_SIGN_BIT applied to lane i (lane 0 in
// bit 0, lane 3 in bit 3). Shifts bind tighter than |, so each term is shifted before
// the terms are OR-ed together.
815 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec )
816 {
817  return AK_SIGN_BIT(in_vec.m_data[0]) | AK_SIGN_BIT(in_vec.m_data[1]) << 1 | AK_SIGN_BIT(in_vec.m_data[2]) << 2 | AK_SIGN_BIT(in_vec.m_data[3]) << 3;
818 }
819 
820 #endif //_AKSIMD_GENERIC_H_
821 
AkForceInline AKSIMD_V2F32 AKSIMD_SET_V2F32(AKSIMD_F32 in_value)
Definition: AkSimd.h:114
float32_t AKSIMD_F32
32-bit float
Definition: AkSimd.h:74
AkForceInline AKSIMD_V4F32 AKSIMD_ADD_SS_V4F32(const AKSIMD_V4F32 &a, const AKSIMD_V4F32 &b)
Definition: AkSimd.h:445
static AkForceInline int AKSIMD_MASK_V4F32(const AKSIMD_V4F32 &in_vec)
Definition: AkSimd.h:815
AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTLEFT_V4I32(AKSIMD_V4I32 in_vector, int in_shiftBy)
Definition: AkSimd.h:351
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:554
AkForceInline AKSIMD_V4F32 AKSIMD_SHUFFLE_V4F32(const AKSIMD_V4F32 &xyzw, const AKSIMD_V4F32 &abcd, int mask)
Definition: AkSimd.h:728
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:393
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimd.h:78
int16_t AkInt16
Signed 16-bit integer.
Definition: AkTypes.h:90
AkForceInline AKSIMD_V4F32 AKSIMD_LTEQ_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:303
AkForceInline AKSIMD_V4F32 AKSIMD_SUB_SS_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:395
#define AKSIMD_GETELEMENT_V4F32(__vName, __num__)
Definition: AkSimd.h:63
AkForceInline AKSIMD_V4F32 AKSIMD_MIN_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:504
AkReal32 m_data[4]
Definition: AkSimd.h:49
AkForceInline AKSIMD_V4F32 AKSIMD_SUB_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:379
AkForceInline AKSIMD_V4F32 AKSIMD_MUL_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:458
AkForceInline AKSIMD_V4F32 AKSIMD_NEG_V4F32(const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:559
#define AkClamp(x, min, max)
Definition: AkPlatformFuncs.h:95
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:566
AkReal32 m_data[2]
Definition: AkSimd.h:48
AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4I32_TO_V4F32(const AKSIMD_V4I32 &in_from)
Definition: AkSimd.h:202
AkForceInline AKSIMD_V4F32 AKSIMD_ABS_V4F32(const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:549
AkForceInline AKSIMD_V4F32 AKSIMD_GTEQ_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:279
AkForceInline AKSIMD_V4I32 AKSIMD_AND_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:232
AkInt32 AKSIMD_I32
32-bit signed integer
Definition: AkSimd.h:44
AkForceInline AKSIMD_V2F32 AKSIMD_MAX_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2)
Definition: AkSimd.h:539
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimd.h:76
AkForceInline AKSIMD_V4I32 AKSIMD_CMPGT_V8I16(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:245
AkForceInline AKSIMD_V2F32 AKSIMD_MIN_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2)
Definition: AkSimd.h:516
AkForceInline AKSIMD_V4F32 AKSIMD_SETZERO_V4F32()
Definition: AkSimd.h:124
#define AkForceInline
Force inlining.
Definition: AkTypes.h:62
AkForceInline AKSIMD_V4F32 AKSIMD_MUL_SS_V4F32(const AKSIMD_V4F32 &a, const AKSIMD_V4F32 &b)
Definition: AkSimd.h:484
AkForceInline AKSIMD_V4F32 AKSIMD_SQRT_V4F32(const AKSIMD_V4F32 &in_vec)
Definition: AkSimd.h:570
int32x4_t AKSIMD_V4I32
Vector of 4 32-bit signed integers.
Definition: AkSimd.h:68
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:578
int32_t AkInt32
Signed 32-bit integer.
Definition: AkTypes.h:91
AkForceInline AKSIMD_V4F32 AKSIMD_RSQRT_V4F32(const AKSIMD_V4F32 &in_vec)
Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
Definition: AkSimd.h:584
AkForceInline AKSIMD_V2F32 AKSIMD_SQRT_V2F32(const AKSIMD_V2F32 &in_vec)
Definition: AkSimd.h:595
AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKHI_VECTOR8I16(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:645
AkForceInline void AKSIMD_STORE1_V4F32(AKSIMD_F32 *in_pTo, const AKSIMD_V4F32 &in_vec)
Definition: AkSimd.h:189
#define AkMin(x1, x2)
Definition: AkPlatformFuncs.h:94
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:721
AkForceInline AKSIMD_V2F32 AKSIMD_MUL_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2)
Definition: AkSimd.h:470
AkForceInline AKSIMD_V4F32 AKSIMD_EQ_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:327
AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4I32(const AKSIMD_V4F32 &in_from)
Definition: AkSimd.h:213
AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKLO_VECTOR8I16(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:624
uint32x4_t AKSIMD_V4UI32
Vector of 4 32-bit unsigned integers.
Definition: AkSimd.h:71
AkForceInline AKSIMD_V4F32 AKSIMD_MAX_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:527
float32x2_t AKSIMD_V2F32
Vector of 2 32-bit floats.
Definition: AkSimd.h:75
AkForceInline AKSIMD_V4F32 AKSIMD_SET_V4F32(AKSIMD_F32 in_value)
Definition: AkSimd.h:102
AkForceInline AKSIMD_V4UI32 AKSIMD_CMPLE_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Compares for less than or equal (see _mm_cmple_ps)
Definition: AkSimd.h:267
AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:339
AkForceInline AKSIMD_V4I32 AKSIMD_SETZERO_V4I32()
Definition: AkSimd.h:144
#define AKSIMD_SHUFFLE_BADC(__a__)
Swap the 2 lower floats together and the 2 higher floats together.
Definition: AkSimd.h:757
AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTRIGHTARITH_V4I32(AKSIMD_V4I32 in_vector, int in_shiftBy)
Definition: AkSimd.h:361
AkForceInline AKSIMD_V2F32 AKSIMD_ADD_V2F32(const AKSIMD_V2F32 &in_vec1, const AKSIMD_V2F32 &in_vec2)
Definition: AkSimd.h:432
AkForceInline AKSIMD_V4F32 AKSIMD_ADD_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:408
#define AkMax(x1, x2)
Definition: AkPlatformFuncs.h:93
uint32_t AkUInt32
Unsigned 32-bit integer.
Definition: AkTypes.h:78
#define AK_SIGN_BIT(val)
Definition: AkSimd.h:813
AkInt32 m_data[4]
Definition: AkSimd.h:45
#define AKSIMD_MOVEHL_V4F32(a, b)
Definition: AkSimd.h:745
AkForceInline AKSIMD_V4F32 AKSIMD_LOAD_SS_V4F32(const AKSIMD_F32 *in_pData)
Definition: AkSimd.h:159
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL(const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2)
Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary par...
Definition: AkSimd.h:797
AkForceInline AKSIMD_V4F32 AKSIMD_GT_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:291
AkForceInline AKSIMD_V4F32 AKSIMD_LT_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:315
float AkReal32
32-bit floating point
Definition: AkTypes.h:96
AkForceInline AKSIMD_V4F32 AKSIMD_LOAD1_V4F32(AKSIMD_F32 in_value)
Definition: AkSimd.h:90
AkUInt32 m_data[4]
Definition: AkSimd.h:46
AkForceInline AKSIMD_V2F32 AKSIMD_SETZERO_V2F32()
Definition: AkSimd.h:135
static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 &vVec)
Definition: AkSimd.h:788