Wwise SDK 2019.2.6
AkSimd.h
/*******************************************************************************
The content of this file includes portions of the AUDIOKINETIC Wwise Technology
released in source code form as part of the SDK installer package.

Commercial License Usage

Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
may use this file in accordance with the end user license agreement provided
with the software or, alternatively, in accordance with the terms contained in a
written agreement between you and Audiokinetic Inc.

Apache License Usage

Alternatively, this file may be used under the Apache License, Version 2.0 (the
"Apache License"); you may not use this file except in compliance with the
Apache License. You may obtain a copy of the Apache License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing, software distributed
under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
the specific language governing permissions and limitations under the License.

 Version: <VERSION> Build: <BUILDNUMBER>
 Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
*******************************************************************************/

// AkSimd.h

/// \file
/// AKSIMD - Generic (no SIMD support) implementation

#ifndef _AKSIMD_GENERIC_H_
#define _AKSIMD_GENERIC_H_

#include <math.h>
#include <string.h>
#include <AK/SoundEngine/Common/AkTypes.h>
#include <AK/Tools/Common/AkPlatformFuncs.h>

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD types
//@{
typedef AkInt32 AKSIMD_I32; ///< 32-bit signed integer
typedef struct { AkInt32 m_data[4]; } AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers
typedef struct { AkUInt32 m_data[4]; } AKSIMD_V4UI32; ///< Vector of 4 32-bit unsigned integers
typedef AkReal32 AKSIMD_F32; ///< 32-bit float
typedef struct { AkReal32 m_data[2]; } AKSIMD_V2F32; ///< Vector of 2 32-bit floats
typedef struct { AkReal32 m_data[4]; } AKSIMD_V4F32; ///< Vector of 4 32-bit floats
typedef AKSIMD_V4UI32 AKSIMD_V4COND; ///< Vector of 4 comparison results

#pragma pack(push,1)
typedef struct { AkInt32 m_data[4]; } AKSIMD_V4I32_UNALIGNED; ///< Unaligned Vector of 4 32-bit signed integers
typedef struct { AkUInt32 m_data[4]; } AKSIMD_V4UI32_UNALIGNED; ///< Unaligned Vector of 4 32-bit unsigned integers
typedef struct { AkReal32 m_data[2]; } AKSIMD_V2F32_UNALIGNED; ///< Unaligned Vector of 2 32-bit floats
typedef struct { AkReal32 m_data[4]; } AKSIMD_V4F32_UNALIGNED; ///< Unaligned Vector of 4 32-bit floats
#pragma pack(pop)

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name Platform specific defines for prefetching
//@{

#define AKSIMD_ARCHCACHELINESIZE (32) ///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Maximum amount of prefetching that is desirable (assuming 8-way cache)
/// Cross-platform memory prefetch of effective address assuming non-temporal data.
/// No-op in the generic implementation.
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ )

//@}
////////////////////////////////////////////////////////////////////////

#ifndef AKSIMD_GETELEMENT_V4F32
#define AKSIMD_GETELEMENT_V4F32( __vName, __num__ ) (__vName).m_data[(__num__)]
#endif

#ifndef AKSIMD_GETELEMENT_V2F32
#define AKSIMD_GETELEMENT_V2F32( __vName, __num__ ) (__vName).m_data[(__num__)]
#endif

#ifndef AKSIMD_GETELEMENT_V4I32
#define AKSIMD_GETELEMENT_V4I32( __vName, __num__ ) (__vName).m_data[(__num__)]
#endif

////////////////////////////////////////////////////////////////////////
/// @name Platform specific memory size alignment for allocation purposes
//@{
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
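
// Example (illustrative): AKSIMD_ALIGNSIZE rounds a byte count up to the next
// multiple of 16, e.g.
//   AKSIMD_ALIGNSIZE( 1 )  == 16
//   AKSIMD_ALIGNSIZE( 16 ) == 16
//   AKSIMD_ALIGNSIZE( 37 ) == 48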
//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD loading / setting
//@{
#define AKSIMD_LOADU_V4I32( in_pData ) (*(in_pData))

#define AKSIMD_LOADU_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))

#define AKSIMD_LOAD_V4F32( in_pValue ) (*(AKSIMD_V4F32*)(in_pValue))

AkForceInline AKSIMD_V4F32 AKSIMD_LOAD1_V4F32( AKSIMD_F32 in_value )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = in_value;
    vector.m_data[1] = in_value;
    vector.m_data[2] = in_value;
    vector.m_data[3] = in_value;

    return vector;
}

// _mm_set_ps1
AkForceInline AKSIMD_V4F32 AKSIMD_SET_V4F32( AKSIMD_F32 in_value )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = in_value;
    vector.m_data[1] = in_value;
    vector.m_data[2] = in_value;
    vector.m_data[3] = in_value;

    return vector;
}
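
// Example (illustrative): splat a scalar gain across all four lanes, then
// scale a block of samples with it (vSamples is a hypothetical AKSIMD_V4F32
// holding four samples):
//   AKSIMD_V4F32 vGain   = AKSIMD_SET_V4F32( 0.5f );            // { 0.5f, 0.5f, 0.5f, 0.5f }
//   AKSIMD_V4F32 vScaled = AKSIMD_MUL_V4F32( vSamples, vGain ); // each sample halved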


AkForceInline AKSIMD_V2F32 AKSIMD_SET_V2F32( AKSIMD_F32 in_value )
{
    AKSIMD_V2F32 vector;
    vector.m_data[0] = in_value;
    vector.m_data[1] = in_value;

    return vector;
}

// _mm_setzero_ps()
AkForceInline AKSIMD_V4F32 AKSIMD_SETZERO_V4F32()
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = 0.f;
    vector.m_data[1] = 0.f;
    vector.m_data[2] = 0.f;
    vector.m_data[3] = 0.f;

    return vector;
}

AkForceInline AKSIMD_V2F32 AKSIMD_SETZERO_V2F32()
{
    AKSIMD_V2F32 vector;
    vector.m_data[0] = 0.f;
    vector.m_data[1] = 0.f;

    return vector;
}

// _mm_setzero_si128()
AkForceInline AKSIMD_V4I32 AKSIMD_SETZERO_V4I32()
{
    AKSIMD_V4I32 vector;
    vector.m_data[0] = 0;
    vector.m_data[1] = 0;
    vector.m_data[2] = 0;
    vector.m_data[3] = 0;

    return vector;
}


/// Loads a single-precision, floating-point value into the low word
/// and clears the upper three words.
/// r0 := *p; r1 := 0.0; r2 := 0.0; r3 := 0.0 (see _mm_load_ss)
AkForceInline AKSIMD_V4F32 AKSIMD_LOAD_SS_V4F32( const AKSIMD_F32* in_pData )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = *in_pData;
    vector.m_data[1] = 0.f;
    vector.m_data[2] = 0.f;
    vector.m_data[3] = 0.f;

    return vector;
}

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD storing
//@{

// _mm_storeu_ps -- The address does not need to be 16-byte aligned.
#define AKSIMD_STOREU_V4F32( in_pTo, in_vec ) (*(AKSIMD_V4F32*)(in_pTo)) = (in_vec)

// _mm_store_ps -- On SSE the address must be 16-byte aligned; the generic
// implementation has no alignment requirement, so the aligned store simply
// forwards to the unaligned one.
#define AKSIMD_STORE_V4F32( __addr__, __vName__ ) AKSIMD_STOREU_V4F32(__addr__, __vName__)

// _mm_storeu_si128
#define AKSIMD_STOREU_V4I32( in_pTo, in_vec ) (*(AKSIMD_V4I32*)(in_pTo)) = (in_vec)

/// Stores the lower single-precision, floating-point value.
/// *p := a0 (see _mm_store_ss)
AkForceInline void AKSIMD_STORE1_V4F32( AKSIMD_F32* in_pTo, const AKSIMD_V4F32& in_vec )
{
    ((AKSIMD_V4F32*)in_pTo)->m_data[0] = in_vec.m_data[0];
}

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD conversion
//@{

// _mm_cvtepi32_ps
AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4I32_TO_V4F32( const AKSIMD_V4I32& in_from )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = (AkReal32)in_from.m_data[0];
    vector.m_data[1] = (AkReal32)in_from.m_data[1];
    vector.m_data[2] = (AkReal32)in_from.m_data[2];
    vector.m_data[3] = (AkReal32)in_from.m_data[3];

    return vector;
}
// _mm_cvttps_epi32 (truncates toward zero, unlike _mm_cvtps_epi32, which rounds to nearest)
AkForceInline AKSIMD_V4I32 AKSIMD_TRUNCATE_V4F32_TO_V4I32( const AKSIMD_V4F32& in_from )
{
    AKSIMD_V4I32 vector;
    vector.m_data[0] = (AkInt32)in_from.m_data[0];
    vector.m_data[1] = (AkInt32)in_from.m_data[1];
    vector.m_data[2] = (AkInt32)in_from.m_data[2];
    vector.m_data[3] = (AkInt32)in_from.m_data[3];

    return vector;
}
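
// Example (illustrative): the conversion truncates toward zero, matching a C
// cast rather than round-to-nearest:
//   { 1.7f, -1.7f, 2.5f, -0.9f } -> { 1, -1, 2, 0 }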

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD logical operations
//@{

// _mm_and_si128
AkForceInline AKSIMD_V4I32 AKSIMD_AND_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
{
    AKSIMD_V4I32 vector;
    vector.m_data[0] = in_vec1.m_data[0] & in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1] & in_vec2.m_data[1];
    vector.m_data[2] = in_vec1.m_data[2] & in_vec2.m_data[2];
    vector.m_data[3] = in_vec1.m_data[3] & in_vec2.m_data[3];

    return vector;
}

/// Compares the 8 signed 16-bit integers in a and the 8 signed
/// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
AkForceInline AKSIMD_V4I32 AKSIMD_CMPGT_V8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
{
    AKSIMD_V4I32 vector;

    const AkInt16 *pVec1 = (const AkInt16*)&in_vec1;
    const AkInt16 *pVec2 = (const AkInt16*)&in_vec2;
    AkInt16 *pVec3 = (AkInt16*)&vector;

    pVec3[0] = (AkInt16)((pVec1[0] > pVec2[0]) ? 0xffff : 0x0);
    pVec3[1] = (AkInt16)((pVec1[1] > pVec2[1]) ? 0xffff : 0x0);
    pVec3[2] = (AkInt16)((pVec1[2] > pVec2[2]) ? 0xffff : 0x0);
    pVec3[3] = (AkInt16)((pVec1[3] > pVec2[3]) ? 0xffff : 0x0);
    pVec3[4] = (AkInt16)((pVec1[4] > pVec2[4]) ? 0xffff : 0x0);
    pVec3[5] = (AkInt16)((pVec1[5] > pVec2[5]) ? 0xffff : 0x0);
    pVec3[6] = (AkInt16)((pVec1[6] > pVec2[6]) ? 0xffff : 0x0);
    pVec3[7] = (AkInt16)((pVec1[7] > pVec2[7]) ? 0xffff : 0x0);

    return vector;
}
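
// Example (illustrative): each 16-bit lane becomes an all-ones mask where
// a > b and zero elsewhere; for the first four lanes:
//   a = { 5, -3, 7, 0, ... }, b = { 2, -1, 7, -4, ... }
//   result = { 0xFFFF, 0x0000, 0x0000, 0xFFFF, ... }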

/// Compares for less than or equal (see _mm_cmple_ps)
AkForceInline AKSIMD_V4UI32 AKSIMD_CMPLE_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4UI32 vector;

    vector.m_data[0] = (in_vec1.m_data[0] <= in_vec2.m_data[0]) ? 0xffffffff : 0x0;
    vector.m_data[1] = (in_vec1.m_data[1] <= in_vec2.m_data[1]) ? 0xffffffff : 0x0;
    vector.m_data[2] = (in_vec1.m_data[2] <= in_vec2.m_data[2]) ? 0xffffffff : 0x0;
    vector.m_data[3] = (in_vec1.m_data[3] <= in_vec2.m_data[3]) ? 0xffffffff : 0x0;

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_GTEQ_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] >= in_vec2.m_data[0]) ? 0xffffffff : 0x0);
    vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] >= in_vec2.m_data[1]) ? 0xffffffff : 0x0);
    vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] >= in_vec2.m_data[2]) ? 0xffffffff : 0x0);
    vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] >= in_vec2.m_data[3]) ? 0xffffffff : 0x0);

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_GT_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] > in_vec2.m_data[0]) ? 0xffffffff : 0x0);
    vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] > in_vec2.m_data[1]) ? 0xffffffff : 0x0);
    vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] > in_vec2.m_data[2]) ? 0xffffffff : 0x0);
    vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] > in_vec2.m_data[3]) ? 0xffffffff : 0x0);

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_LTEQ_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] <= in_vec2.m_data[0]) ? 0xffffffff : 0x0);
    vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] <= in_vec2.m_data[1]) ? 0xffffffff : 0x0);
    vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] <= in_vec2.m_data[2]) ? 0xffffffff : 0x0);
    vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] <= in_vec2.m_data[3]) ? 0xffffffff : 0x0);

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_LT_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] < in_vec2.m_data[0]) ? 0xffffffff : 0x0);
    vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] < in_vec2.m_data[1]) ? 0xffffffff : 0x0);
    vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] < in_vec2.m_data[2]) ? 0xffffffff : 0x0);
    vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] < in_vec2.m_data[3]) ? 0xffffffff : 0x0);

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_EQ_V4F32(const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2)
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = (AkReal32)((in_vec1.m_data[0] == in_vec2.m_data[0]) ? 0xffffffff : 0x0);
    vector.m_data[1] = (AkReal32)((in_vec1.m_data[1] == in_vec2.m_data[1]) ? 0xffffffff : 0x0);
    vector.m_data[2] = (AkReal32)((in_vec1.m_data[2] == in_vec2.m_data[2]) ? 0xffffffff : 0x0);
    vector.m_data[3] = (AkReal32)((in_vec1.m_data[3] == in_vec2.m_data[3]) ? 0xffffffff : 0x0);

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = (AkReal32)(((AkUInt32)in_vec1.m_data[0]) ^ ((AkUInt32)in_vec2.m_data[0]));
    vector.m_data[1] = (AkReal32)(((AkUInt32)in_vec1.m_data[1]) ^ ((AkUInt32)in_vec2.m_data[1]));
    vector.m_data[2] = (AkReal32)(((AkUInt32)in_vec1.m_data[2]) ^ ((AkUInt32)in_vec2.m_data[2]));
    vector.m_data[3] = (AkReal32)(((AkUInt32)in_vec1.m_data[3]) ^ ((AkUInt32)in_vec2.m_data[3]));

    return vector;
}

/// Shifts each 32-bit lane left.
AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTLEFT_V4I32( AKSIMD_V4I32 in_vector, int in_shiftBy)
{
    in_vector.m_data[0] <<= in_shiftBy;
    in_vector.m_data[1] <<= in_shiftBy;
    in_vector.m_data[2] <<= in_shiftBy;
    in_vector.m_data[3] <<= in_shiftBy;

    return in_vector;
}

/// Shifts each 32-bit lane right, logically (zero-filling).
AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTRIGHT_V4I32( AKSIMD_V4I32 in_vector, int in_shiftBy)
{
    in_vector.m_data[0] = (AkInt32)((AkUInt32)in_vector.m_data[0] >> in_shiftBy);
    in_vector.m_data[1] = (AkInt32)((AkUInt32)in_vector.m_data[1] >> in_shiftBy);
    in_vector.m_data[2] = (AkInt32)((AkUInt32)in_vector.m_data[2] >> in_shiftBy);
    in_vector.m_data[3] = (AkInt32)((AkUInt32)in_vector.m_data[3] >> in_shiftBy);

    return in_vector;
}

/// Shifts each 32-bit lane right, arithmetically (sign-preserving).
AkForceInline AKSIMD_V4I32 AKSIMD_SHIFTRIGHTARITH_V4I32( AKSIMD_V4I32 in_vector, int in_shiftBy)
{
    in_vector.m_data[0] >>= in_shiftBy;
    in_vector.m_data[1] >>= in_shiftBy;
    in_vector.m_data[2] >>= in_shiftBy;
    in_vector.m_data[3] >>= in_shiftBy;

    return in_vector;
}
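
// Example (illustrative): the logical shift treats each lane as unsigned,
// while the arithmetic shift preserves the sign bit:
//   AKSIMD_SHIFTRIGHT_V4I32( v, 1 )      // a lane holding -8 becomes 0x7FFFFFFC
//   AKSIMD_SHIFTRIGHTARITH_V4I32( v, 1 ) // a lane holding -8 becomes -4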

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD arithmetic
//@{
// _mm_sub_ps
AkForceInline AKSIMD_V4F32 AKSIMD_SUB_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1] - in_vec2.m_data[1];
    vector.m_data[2] = in_vec1.m_data[2] - in_vec2.m_data[2];
    vector.m_data[3] = in_vec1.m_data[3] - in_vec2.m_data[3];

    return vector;
}

/// Subtracts the lower single-precision, floating-point values of a and b.
/// The upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 - b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_sub_ss)
AkForceInline AKSIMD_V4F32 AKSIMD_SUB_SS_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = in_vec1.m_data[0] - in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1];
    vector.m_data[2] = in_vec1.m_data[2];
    vector.m_data[3] = in_vec1.m_data[3];

    return vector;
}

// _mm_add_ps
AkForceInline AKSIMD_V4F32 AKSIMD_ADD_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];
    vector.m_data[2] = in_vec1.m_data[2] + in_vec2.m_data[2];
    vector.m_data[3] = in_vec1.m_data[3] + in_vec2.m_data[3];

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = in_vec1.m_data[0] / in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1] / in_vec2.m_data[1];
    vector.m_data[2] = in_vec1.m_data[2] / in_vec2.m_data[2];
    vector.m_data[3] = in_vec1.m_data[3] / in_vec2.m_data[3];

    return vector;
}

AkForceInline AKSIMD_V2F32 AKSIMD_ADD_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
{
    AKSIMD_V2F32 vector;

    vector.m_data[0] = in_vec1.m_data[0] + in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1] + in_vec2.m_data[1];

    return vector;
}

/// Adds the lower single-precision, floating-point values of a and b; the
/// upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
AkForceInline AKSIMD_V4F32 AKSIMD_ADD_SS_V4F32( const AKSIMD_V4F32& a, const AKSIMD_V4F32& b )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = a.m_data[0] + b.m_data[0];
    vector.m_data[1] = a.m_data[1];
    vector.m_data[2] = a.m_data[2];
    vector.m_data[3] = a.m_data[3];

    return vector;
}

// _mm_mul_ps
AkForceInline AKSIMD_V4F32 AKSIMD_MUL_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];
    vector.m_data[2] = in_vec1.m_data[2] * in_vec2.m_data[2];
    vector.m_data[3] = in_vec1.m_data[3] * in_vec2.m_data[3];

    return vector;
}

AkForceInline AKSIMD_V2F32 AKSIMD_MUL_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
{
    AKSIMD_V2F32 vector;

    vector.m_data[0] = in_vec1.m_data[0] * in_vec2.m_data[0];
    vector.m_data[1] = in_vec1.m_data[1] * in_vec2.m_data[1];

    return vector;
}

/// Multiplies the lower single-precision, floating-point values of
/// a and b; the upper three single-precision, floating-point values
/// are passed through from a.
/// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
AkForceInline AKSIMD_V4F32 AKSIMD_MUL_SS_V4F32( const AKSIMD_V4F32& a, const AKSIMD_V4F32& b )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = a.m_data[0] * b.m_data[0];
    vector.m_data[1] = a.m_data[1];
    vector.m_data[2] = a.m_data[2];
    vector.m_data[3] = a.m_data[3];

    return vector;
}

/// Vector multiply-add operation.
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
/// Vector multiply-subtract operation.
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )

/// Vector multiply-add operation on the lower element.
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( (__a__), (__b__) ), (__c__) )
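
// Example (illustrative): a per-lane multiply-accumulate as it might appear in
// a mixing loop (vIn, vGain and vAcc are hypothetical AKSIMD_V4F32 values):
//   vAcc = AKSIMD_MADD_V4F32( vIn, vGain, vAcc ); // vAcc += vIn * vGain, lane by lane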

// _mm_min_ps
AkForceInline AKSIMD_V4F32 AKSIMD_MIN_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
    vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);
    vector.m_data[2] = AkMin(in_vec1.m_data[2], in_vec2.m_data[2]);
    vector.m_data[3] = AkMin(in_vec1.m_data[3], in_vec2.m_data[3]);

    return vector;
}

AkForceInline AKSIMD_V2F32 AKSIMD_MIN_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
{
    AKSIMD_V2F32 vector;

    vector.m_data[0] = AkMin(in_vec1.m_data[0], in_vec2.m_data[0]);
    vector.m_data[1] = AkMin(in_vec1.m_data[1], in_vec2.m_data[1]);

    return vector;
}

// _mm_max_ps
AkForceInline AKSIMD_V4F32 AKSIMD_MAX_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;

    vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
    vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);
    vector.m_data[2] = AkMax(in_vec1.m_data[2], in_vec2.m_data[2]);
    vector.m_data[3] = AkMax(in_vec1.m_data[3], in_vec2.m_data[3]);

    return vector;
}

AkForceInline AKSIMD_V2F32 AKSIMD_MAX_V2F32( const AKSIMD_V2F32& in_vec1, const AKSIMD_V2F32& in_vec2 )
{
    AKSIMD_V2F32 vector;

    vector.m_data[0] = AkMax(in_vec1.m_data[0], in_vec2.m_data[0]);
    vector.m_data[1] = AkMax(in_vec1.m_data[1], in_vec2.m_data[1]);

    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_ABS_V4F32( const AKSIMD_V4F32& in_vec1 )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = fabsf(in_vec1.m_data[0]);
    vector.m_data[1] = fabsf(in_vec1.m_data[1]);
    vector.m_data[2] = fabsf(in_vec1.m_data[2]);
    vector.m_data[3] = fabsf(in_vec1.m_data[3]);
    return vector;
}

AkForceInline AKSIMD_V4F32 AKSIMD_NEG_V4F32( const AKSIMD_V4F32& in_vec1 )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = -in_vec1.m_data[0];
    vector.m_data[1] = -in_vec1.m_data[1];
    vector.m_data[2] = -in_vec1.m_data[2];
    vector.m_data[3] = -in_vec1.m_data[3];
    return vector;
}

// _mm_sqrt_ps
AkForceInline AKSIMD_V4F32 AKSIMD_SQRT_V4F32( const AKSIMD_V4F32& in_vec )
{
    AKSIMD_V4F32 vCompare;
    AKSIMD_GETELEMENT_V4F32(vCompare,0) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,0) );
    AKSIMD_GETELEMENT_V4F32(vCompare,1) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,1) );
    AKSIMD_GETELEMENT_V4F32(vCompare,2) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,2) );
    AKSIMD_GETELEMENT_V4F32(vCompare,3) = sqrtf( AKSIMD_GETELEMENT_V4F32(in_vec,3) );

    return vCompare;
}

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
AkForceInline AKSIMD_V4F32 AKSIMD_RSQRT_V4F32(const AKSIMD_V4F32& in_vec)
{
    AKSIMD_V4F32 vCompare;
    AKSIMD_GETELEMENT_V4F32(vCompare, 0) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 0));
    AKSIMD_GETELEMENT_V4F32(vCompare, 1) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 1));
    AKSIMD_GETELEMENT_V4F32(vCompare, 2) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 2));
    AKSIMD_GETELEMENT_V4F32(vCompare, 3) = 1.f / sqrtf(AKSIMD_GETELEMENT_V4F32(in_vec, 3));

    return vCompare;
}

AkForceInline AKSIMD_V2F32 AKSIMD_SQRT_V2F32( const AKSIMD_V2F32& in_vec )
{
    AKSIMD_V2F32 vCompare;
    AKSIMD_GETELEMENT_V2F32(vCompare,0) = sqrtf( AKSIMD_GETELEMENT_V2F32(in_vec,0) );
    AKSIMD_GETELEMENT_V2F32(vCompare,1) = sqrtf( AKSIMD_GETELEMENT_V2F32(in_vec,1) );

    return vCompare;
}

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD packing / unpacking
//@{

//
// _mm_unpacklo_epi16
// r0 := a0
// r1 := b0
// r2 := a1
// r3 := b1
// r4 := a2
// r5 := b2
// r6 := a3
// r7 := b3
AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKLO_VECTOR8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
{
    AKSIMD_V4I32 vector;
    const AkInt16 *pVec1 = (const AkInt16*)&in_vec1;
    const AkInt16 *pVec2 = (const AkInt16*)&in_vec2;
    AkInt16 *pDest = (AkInt16*)&vector;

    pDest[0] = pVec1[0];
    pDest[1] = pVec2[0];
    pDest[2] = pVec1[1];
    pDest[3] = pVec2[1];
    pDest[4] = pVec1[2];
    pDest[5] = pVec2[2];
    pDest[6] = pVec1[3];
    pDest[7] = pVec2[3];

    return vector;
}

// _mm_unpackhi_epi16
AkForceInline AKSIMD_V4I32 AKSIMD_UNPACKHI_VECTOR8I16( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
{
    AKSIMD_V4I32 vector;
    const AkInt16 *pVec1 = (const AkInt16*)&in_vec1;
    const AkInt16 *pVec2 = (const AkInt16*)&in_vec2;
    AkInt16 *pDest = (AkInt16*)&vector;

    pDest[0] = pVec1[4];
    pDest[1] = pVec2[4];
    pDest[2] = pVec1[5];
    pDest[3] = pVec2[5];
    pDest[4] = pVec1[6];
    pDest[5] = pVec2[6];
    pDest[6] = pVec1[7];
    pDest[7] = pVec2[7];

    return vector;
}

// _mm_unpacklo_ps
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = in_vec1.m_data[0];
    vector.m_data[1] = in_vec2.m_data[0];
    vector.m_data[2] = in_vec1.m_data[1];
    vector.m_data[3] = in_vec2.m_data[1];

    return vector;
}

// _mm_unpackhi_ps
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = in_vec1.m_data[2];
    vector.m_data[1] = in_vec2.m_data[2];
    vector.m_data[2] = in_vec1.m_data[3];
    vector.m_data[3] = in_vec2.m_data[3];

    return vector;
}

// _mm_packs_epi32
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
{
    AKSIMD_V4I32 vector;
    AkInt16 *pDest = (AkInt16*)&vector;

    // Clamp in 32 bits first, then narrow to 16 bits; casting to AkInt16
    // before clamping would wrap out-of-range values and defeat the saturation.
    pDest[0] = (AkInt16)AkClamp(in_vec1.m_data[0], -32768, 32767);
    pDest[1] = (AkInt16)AkClamp(in_vec1.m_data[1], -32768, 32767);
    pDest[2] = (AkInt16)AkClamp(in_vec1.m_data[2], -32768, 32767);
    pDest[3] = (AkInt16)AkClamp(in_vec1.m_data[3], -32768, 32767);
    pDest[4] = (AkInt16)AkClamp(in_vec2.m_data[0], -32768, 32767);
    pDest[5] = (AkInt16)AkClamp(in_vec2.m_data[1], -32768, 32767);
    pDest[6] = (AkInt16)AkClamp(in_vec2.m_data[2], -32768, 32767);
    pDest[7] = (AkInt16)AkClamp(in_vec2.m_data[3], -32768, 32767);

    return vector;
}
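
// Example (illustrative): out-of-range values saturate rather than wrap,
// mirroring _mm_packs_epi32:
//   { 40000, -40000, 123, 0 } packs to 16-bit lanes { 32767, -32768, 123, 0 }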

//@}
////////////////////////////////////////////////////////////////////////

//#define AKSIMD_GET_ITEM( vec, index ) vec[index]

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shuffling
//@{

// See _MM_SHUFFLE
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

// See _mm_shuffle_ps
// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
//#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw )

AkForceInline AKSIMD_V4F32 AKSIMD_SHUFFLE_V4F32( const AKSIMD_V4F32& xyzw, const AKSIMD_V4F32& abcd, int mask )
{
    AKSIMD_V4F32 vector;
    vector.m_data[0] = xyzw.m_data[(mask) & 0x3];
    vector.m_data[1] = xyzw.m_data[(mask >> 2) & 0x3];
    vector.m_data[2] = abcd.m_data[(mask >> 4) & 0x3];
    vector.m_data[3] = abcd.m_data[(mask >> 6) & 0x3];

    return vector;
}
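
// Example (illustrative): the mask picks two lanes from each source; the low
// two result lanes come from the first argument, the high two from the second:
//   xyzw = { x, y, z, w }, abcd = { a, b, c, d }
//   AKSIMD_SHUFFLE_V4F32( xyzw, abcd, AKSIMD_SHUFFLE(3, 2, 1, 0) ) == { x, y, c, d }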

/// Moves the upper two single-precision, floating-point values of b to
/// the lower two single-precision, floating-point values of the result.
/// The upper two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
#define AKSIMD_MOVEHL_V4F32( a, b ) \
    AKSIMD_SHUFFLE_V4F32( (b), (a), AKSIMD_SHUFFLE(3, 2, 3, 2) )

/// Moves the lower two single-precision, floating-point values of b to
/// the upper two single-precision, floating-point values of the result.
/// The lower two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := b1; r2 := b0; r1 := a1; r0 := a0 (see _mm_movelh_ps)
#define AKSIMD_MOVELH_V4F32( a, b ) \
    AKSIMD_SHUFFLE_V4F32( (a), (b), AKSIMD_SHUFFLE(1, 0, 1, 0) )

/// Swap the 2 lower floats together and the 2 higher floats together (d c b a -> c d a b).
#define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))

/// Swap the 2 lower floats with the 2 higher floats (d c b a -> b a d c).
#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))

/// Barrel-shift all floats by one.
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))

/// Duplicates the odd items into the even items (d c b a -> d d b b)
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even items into the odd items (d c b a -> c c a a)
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))


//#include <AK/SoundEngine/Platforms/Generic/AkSimdShuffle.h>

//@}
////////////////////////////////////////////////////////////////////////

// Old AKSIMD -- will search-and-replace later
#define AkReal32Vector AKSIMD_V4F32
#define AKSIMD_LOAD1( __scalar__ ) AKSIMD_LOAD1_V4F32( &__scalar__ )
#define AKSIMD_LOADVEC(v) AKSIMD_LOAD_V4F32((const AKSIMD_F32*)((v)))
#define AKSIMD_MUL AKSIMD_MUL_V4F32
#define AKSIMD_STOREVEC AKSIMD_STORE_V4F32

/// Faked in-place vector horizontal add.
/// \akwarning
/// Don't expect this to be very efficient.
/// \endakwarning
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
{
    AKSIMD_V4F32 vAb = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0xB1);           // 0xB1 == AKSIMD_SHUFFLE(2,3,0,1): swap within each pair
    AKSIMD_V4F32 vHaddAb = AKSIMD_ADD_V4F32(vVec, vAb);                  // { a+b, a+b, c+d, c+d }
    AKSIMD_V4F32 vHaddCd = AKSIMD_SHUFFLE_V4F32(vHaddAb, vHaddAb, 0x4E); // 0x4E == AKSIMD_SHUFFLE(1,0,3,2): swap the two halves
    AKSIMD_V4F32 vHaddAbcd = AKSIMD_ADD_V4F32(vHaddAb, vHaddCd);         // every lane now holds a+b+c+d
    return vHaddAbcd;
}
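
// Example (illustrative): because the total ends up replicated in every lane,
// a four-element dot product reduces to one multiply plus one horizontal add
// (vA and vB are hypothetical AKSIMD_V4F32 values):
//   AKSIMD_V4F32 vSum = AKSIMD_HORIZONTALADD_V4F32( AKSIMD_MUL_V4F32( vA, vB ) );
//   AkReal32 fDot = AKSIMD_GETELEMENT_V4F32( vSum, 0 );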

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
    static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f };

    AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
    vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
    AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
    vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
    vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
    vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 );
    vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
    return vTmp2;
}
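
// Example (illustrative): with interleaved (real, imaginary) pairs, each pair
// multiplies as a complex number, e.g. (1 + 2i) * (3 + 4i) = -5 + 10i:
//   vCIn1 pair 0 = { 1.f, 2.f }, vCIn2 pair 0 = { 3.f, 4.f }
//   result pair 0 = { -5.f, 10.f }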

#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))

#define AK_SIGN_BIT( val ) (((AkUInt32)val) >> 31)

static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec )
{
    return AK_SIGN_BIT(in_vec.m_data[0])
         | AK_SIGN_BIT(in_vec.m_data[1]) << 1
         | AK_SIGN_BIT(in_vec.m_data[2]) << 2
         | AK_SIGN_BIT(in_vec.m_data[3]) << 3;
}

static AkForceInline AKSIMD_V4F32 AKSIMD_RECIP_V4F32(const AKSIMD_V4F32 &v)
{
    AKSIMD_V4F32 r;
    r.m_data[0] = 1.f / v.m_data[0];
    r.m_data[1] = 1.f / v.m_data[1];
    r.m_data[2] = 1.f / v.m_data[2];
    r.m_data[3] = 1.f / v.m_data[3];
    return r;
}

static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32(const AKSIMD_V4F32 & x)
{
    AKSIMD_V4F32 r;
    // Use the float overload; ceil() would silently promote to double.
    r.m_data[0] = ceilf(x.m_data[0]);
    r.m_data[1] = ceilf(x.m_data[1]);
    r.m_data[2] = ceilf(x.m_data[2]);
    r.m_data[3] = ceilf(x.m_data[3]);
    return r;
}

#endif //_AKSIMD_GENERIC_H_