Wwise SDK 2022.1.12
AkSimd.h
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Copyright (c) 2024 Audiokinetic Inc.
25 *******************************************************************************/
26 
27 // AkSimd.h
28 
29 /// \file
30 /// AKSIMD - arm_neon implementation
31 
32 #ifndef _AKSIMD_ARM_NEON_H_
33 #define _AKSIMD_ARM_NEON_H_
34 
35 #if defined _MSC_VER && defined _M_ARM64
36  #include <arm64_neon.h>
37 #else
38  #include <arm_neon.h>
39 #endif
41 
42 // Platform specific defines for prefetching
43 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)
44 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
45 #if defined __clang__ || defined __GNUC__
46 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) __builtin_prefetch(((char *)(__add__))+(__offset__))
47 #else
48 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ )
49 #endif
50 
51 ////////////////////////////////////////////////////////////////////////
52 /// @name Platform specific memory size alignment for allocation purposes
53 //@{
54 
55 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
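// Illustrative example (editor's addition, not in the SDK source): AKSIMD_ALIGNSIZE rounds a byte
// count up to the next multiple of 16, e.g. AKSIMD_ALIGNSIZE(20) == 32 and AKSIMD_ALIGNSIZE(16) == 16.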
56 
57 //@}
58 ////////////////////////////////////////////////////////////////////////
59 
60 ////////////////////////////////////////////////////////////////////////
61 /// @name AKSIMD types
62 //@{
63 
64 typedef int32x4_t AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers
65 typedef int16x8_t AKSIMD_V8I16; ///< Vector of 8 16-bit signed integers
66 typedef int16x4_t AKSIMD_V4I16; ///< Vector of 4 16-bit signed integers
67 typedef uint32x4_t AKSIMD_V4UI32; ///< Vector of 4 32-bit unsigned integers
68 typedef uint32x2_t AKSIMD_V2UI32; ///< Vector of 2 32-bit unsigned integers
69 typedef int32x2_t AKSIMD_V2I32; ///< Vector of 2 32-bit signed integers
70 typedef float32_t AKSIMD_F32; ///< 32-bit float
71 typedef float32x2_t AKSIMD_V2F32; ///< Vector of 2 32-bit floats
72 typedef float32x4_t AKSIMD_V4F32; ///< Vector of 4 32-bit floats
73 
74 typedef uint32x4_t AKSIMD_V4COND; ///< Vector of 4 comparison results
75 typedef uint32x4_t AKSIMD_V4ICOND; ///< Vector of 4 comparison results
76 typedef uint32x4_t AKSIMD_V4FCOND; ///< Vector of 4 comparison results
77 
78 #if defined(AK_CPU_ARM_NEON)
79 typedef float32x2x2_t AKSIMD_V2F32X2;
80 typedef float32x4x2_t AKSIMD_V4F32X2;
81 typedef float32x4x4_t AKSIMD_V4F32X4;
82 
83 typedef int32x4x2_t AKSIMD_V4I32X2;
84 typedef int32x4x4_t AKSIMD_V4I32X4;
85 #endif
86 
87 //@}
88 ////////////////////////////////////////////////////////////////////////
89 
90 ////////////////////////////////////////////////////////////////////////
91 /// @name AKSIMD loading / setting
92 //@{
93 
94 /// Loads four single-precision, floating-point values.
95 /// The address has no alignment requirement (see _mm_loadu_ps).
96 #define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
97 
98 /// Loads four single-precision, floating-point values.
99 /// The address has no alignment requirement (see _mm_loadu_ps).
100 #define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
101 
102 /// Loads a single single-precision, floating-point value, copying it into
103 /// all four words (see _mm_load1_ps, _mm_load_ps1)
104 #define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
105 
106 /// Sets the four single-precision, floating-point values to __scalar__ (see
107 /// _mm_set1_ps, _mm_set_ps1)
108 #define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
109 
110 /// Populates the full vector with the 4 floating point elements provided
111 static AkForceInline float32x4_t AKSIMD_SETV_V4F32(float32_t d, float32_t c, float32_t b, float32_t a) {
112  float32x4_t ret = vdupq_n_f32(0);
113  ret = vsetq_lane_f32(d, ret, 3);
114  ret = vsetq_lane_f32(c, ret, 2);
115  ret = vsetq_lane_f32(b, ret, 1);
116  ret = vsetq_lane_f32(a, ret, 0);
117  return ret;
118 }
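// Illustrative example (editor's addition, not in the SDK source): the arguments are given from the
// highest lane down, so lane 0 receives the last argument. Hypothetical usage:
//   AKSIMD_V4F32 v = AKSIMD_SETV_V4F32(4.f, 3.f, 2.f, 1.f); // lanes 0..3 hold {1.f, 2.f, 3.f, 4.f}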
119 
120 /// Sets the four integer values to __scalar__
121 #define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
122 
123 static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a) {
124  int32x4_t ret = vdupq_n_s32(0);
125  ret = vsetq_lane_s32(d, ret, 3);
126  ret = vsetq_lane_s32(c, ret, 2);
127  ret = vsetq_lane_s32(b, ret, 1);
128  ret = vsetq_lane_s32(a, ret, 0);
129  return ret;
130 }
131 
132 static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a) {
133  // On 32b ARM, dealing with an int64_t could invoke loading in memory directly,
134  // e.g. dereferencing a 64b ptr as one of the inputs
135  // ultimately resulting in a potentially unaligned 64b load.
136  // By reinterpreting and using the 64b inputs as 32b inputs, even a load from
137  // a pointer will not have any alignment requirements
138  // ARM64 can handle dereferencing ptrs to 64b values directly safely, though
139 #if defined AK_CPU_ARM_64
140  int64x2_t ret = vdupq_n_s64(0);
141  ret = vsetq_lane_s64(b, ret, 1);
142  ret = vsetq_lane_s64(a, ret, 0);
143  return vreinterpretq_s32_s64(ret);
144 #else
145  int32x4_t ret = vdupq_n_s32(0);
146  ret = vsetq_lane_s32(int32_t((b >> 32) & 0xFFFFFFFF), ret, 3);
147  ret = vsetq_lane_s32(int32_t((b >> 0) & 0xFFFFFFFF), ret, 2);
148  ret = vsetq_lane_s32(int32_t((a >> 32) & 0xFFFFFFFF), ret, 1);
149  ret = vsetq_lane_s32(int32_t((a >> 0) & 0xFFFFFFFF), ret, 0);
150  return ret;
151 #endif
152 }
153 
154 static AkForceInline float32x4_t AKSIMD_SETV_V2F64(AkReal64 b, AkReal64 a)
155 {
156 #if defined AK_CPU_ARM_64
157  float64x2_t ret = (float64x2_t)vdupq_n_s64(0);
158  ret = vsetq_lane_f64(b, ret, 1);
159  ret = vsetq_lane_f64(a, ret, 0);
160  return (float32x4_t)(ret);
161 #else
162  int64_t a64 = *(int64_t*)&a;
163  int64_t b64 = *(int64_t*)&b;
164  int32x4_t ret = vdupq_n_s32(0);
165  ret = vsetq_lane_s32(int32_t((b64 >> 32) & 0xFFFFFFFF), ret, 3);
166  ret = vsetq_lane_s32(int32_t((b64 >> 0) & 0xFFFFFFFF), ret, 2);
167  ret = vsetq_lane_s32(int32_t((a64 >> 32) & 0xFFFFFFFF), ret, 1);
168  ret = vsetq_lane_s32(int32_t((a64 >> 0) & 0xFFFFFFFF), ret, 0);
169  return vreinterpretq_f32_s32(ret);
170 #endif
171 }
172 
173 /// Sets the four single-precision, floating-point values to zero (see
174 /// _mm_setzero_ps)
175 #define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
176 
177 /// Loads a single-precision, floating-point value into the low word
178 /// and clears the upper three words.
179 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
180 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 );
181 
182 /// Loads four 32-bit signed integer values (aligned)
183 #define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
184 
185 /// Loads 8 16-bit signed integer values (aligned)
186 #define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
187 
188 /// Loads 4 16-bit signed integer values (aligned)
189 #define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
190 
191 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
192 #define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
193 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
194 #define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
195 
196 /// Loads two single-precision, floating-point values
197 #define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
198 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
199 
200 /// Sets the two single-precision, floating-point values to __scalar__
201 #define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
202 
203 /// Sets the two single-precision, floating-point values to zero
204 #define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
205 
206 /// Loads data from memory and de-interleaves
207 #define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
208 #define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
209 
210 /// Loads data from memory and de-interleaves; only selected lane
211 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
212 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
213 
214 //@}
215 ////////////////////////////////////////////////////////////////////////
216 
217 
218 ////////////////////////////////////////////////////////////////////////
219 /// @name AKSIMD storing
220 //@{
221 
222 /// Stores four single-precision, floating-point values.
223 /// The address has no alignment requirement (see _mm_storeu_ps).
224 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
225 
226 /// Stores four single-precision, floating-point values.
227 /// The address has no alignment requirement (see _mm_storeu_ps).
228 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
229 
230 /// Stores the lower single-precision, floating-point value.
231 /// *p := a0 (see _mm_store_ss)
232 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
233 
234 /// Stores four 32-bit integer values. The address must be 16-byte aligned.
235 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
236 
237 /// Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
238 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
239 
240 /// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
241 #define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
242 
243 /// Stores two single-precision, floating-point values. The address must be 16-byte aligned.
244 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
245 
246 /// Stores data by interleaving into memory
247 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
248 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
249 
250 /// Stores the lower double-precision, floating-point value.
251 /// *p := a0 (see _mm_store_sd)
252 #define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) vst1q_lane_f64( (float64_t*)(__addr__), vreinterpretq_f64_f32(__vec__), 0 )
253 
254 //@}
255 ////////////////////////////////////////////////////////////////////////
256 
257 
258 ////////////////////////////////////////////////////////////////////////
259 /// @name AKSIMD conversion
260 //@{
261 
262 /// Converts the four signed 32-bit integer values of a to single-precision,
263 /// floating-point values (see _mm_cvtepi32_ps)
264 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
265 
266 /// Converts the four single-precision, floating-point values of a to signed
267 /// 32-bit integer values (see _mm_cvtps_epi32)
268 static AkForceInline AKSIMD_V4I32 AKSIMD_ROUND_V4F32_TO_V4I32(AKSIMD_V4F32 a) {
269 #if defined AK_CPU_ARM_64
270  return vcvtaq_s32_f32(a);
271 #else
272  // on ARMv7, need to add 0.5 (away from zero) and truncate that
273  float32x4_t halfPos = vdupq_n_f32(0.5f);
274  float32x4_t halfNeg = vdupq_n_f32(-0.5f);
275  float32x4_t zero = vdupq_n_f32(0.0f);
276  const uint32x4_t signMask = vcgtq_f32(a, zero);
277  const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg);
278  const float32x4_t aOffset = vaddq_f32(a, signedHalf);
279  return vcvtq_s32_f32(aOffset);
280 #endif
281 }
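// Illustrative example (editor's addition, not in the SDK source): both paths round half away from
// zero, so {1.5f, -1.5f, 2.4f, -2.6f} converts to {2, -2, 2, -3}; on ARM64, vcvtaq_s32_f32 performs
// this rounding mode directly.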
282 
283 /// Converts the four single-precision, floating-point values of a to signed
284 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
285 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
286 
287 /// Converts the 4 half-precision floats in the lower 64-bits of the provided
288 /// vector to 4 full-precision floats
289 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
290 
291 /// Converts the 4 half-precision floats in the upper 64-bits of the provided
292 /// vector to 4 full-precision floats
293 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
294 
295 
296 static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( uint16x4_t vecs16 )
297 {
298 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
299 
300  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
301  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
302  float32x4_t ret;
303  __asm__("fcvtl %0.4s, %1.4h" \
304  : "=w"(ret) \
305  : "w"(vecs16)
306  );
307  return ret;
308 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
309  float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
310  float32x4_t ret = vcvt_f32_f16(vecf16);
311  uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16);
312  ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData);
313  return ret;
314 #elif defined(AK_CPU_ARM_64)
315  return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
316 #else
317  uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16);
318  uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000));
319  uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds
320 
321  // Determine if value is denorm or not, and use that to determine how much exponent should be scaled by, and if we need post-scale fp adjustment
322  uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
323  uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
324  uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
325 
326  // Apply the exponent increment and adjust
327  uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
328  uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
329 
330  // if fp16 val was inf or nan, preserve the inf/nan exponent field (we can just 'or' the inf/nan exponent in)
331  uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
332  uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
333  uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
334 
335  // reincorporate the sign
336  uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
337  float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
338  return assembledFloat;
339 #endif
340 }
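// Illustrative example (editor's addition, not in the SDK source): as a spot check of the bit
// manipulation above, the half-precision pattern 0x3C00 (1.0) expands to 0x3F800000 (1.0f), and the
// denormal half 0x0001 expands to 2^-24, the smallest positive half-precision value.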
341 
342 
343 /// Converts the vector of 4 full-precision floats to 4 half-precision floats
344 /// occupying the lower bits and leaving the upper bits as zero
345 static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16( AKSIMD_V4F32 vec )
346 {
347 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
348  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
349  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
350  int32x4_t ret;
351  __asm__("fcvtn %1.4h, %1.4s\n" \
352  "\tmov %0.8b, %1.8b" \
353  : "=w"(ret) \
354  : "w"(vec)
355  );
356  return ret;
357 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
358  float16x4_t ret = vcvt_f16_f32(vec);
359  uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
360  ret = vorr_u16(vreinterpret_s16_f16(ret), signData);
361  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
362 #elif defined(AK_CPU_ARM_64)
363  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
364 #else
365  uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
366  uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
367 
368  // do the processing for values that will be denormed in float16
369  // Add 0.5 to get value within range, and round; then move mantissa data up
370  float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
371  uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
372 
373  // processing for values that will be normal in float16
374  uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF); // -131072 + rounding bias
375  uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
376  uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
377  uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31); // Extend Lsb so that it's -1 when set
378  uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
379  uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
380 
381  // Combine the norm and subnorm paths together
382  uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23); // smallest float32 that yields a normalized float16
383  uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
384 
385  uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
386 
387  // apply inf/nan check
388  uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000)); // test if exponent bits are zero or not
389  uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
390  uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec)); // mark the parts of the vector where we have a mantissa (i.e. NAN) as 0xffffffff
391  uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000)); // set the NaN mantissa bit if mantissa suggests this is NaN
392  uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
393  uint32x4_t infNanData = vorrq_u32(infData, nantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
394 
395  uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData); // and combine the results
396 
397  // reincorporate the original sign
398  uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
399 
400  // store results packed in lower 64 bits, and set upper 64 to zero
401  uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
402  return vreinterpretq_s32_u16(resultZip.val[1]);
403 #endif
404 }
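// Illustrative example (editor's addition, not in the SDK source): the four half-precision results
// are packed into the low 64 bits of the returned vector (the upper 64 bits are zero); for instance
// 1.0f (0x3F800000) converts to the half-precision pattern 0x3C00.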
405 
406 //@}
407 ////////////////////////////////////////////////////////////////////////
408 
409 ////////////////////////////////////////////////////////////////////////
410 /// @name AKSIMD cast
411 //@{
412 
413 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
414 /// used for compilation and does not generate any instructions, thus it has zero latency.
415 #define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
416 
417 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
418 /// used for compilation and does not generate any instructions, thus it has zero latency.
419 #define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
420 
421 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
422 /// used for compilation and does not generate any instructions, thus it has zero latency.
423 #define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
424 
425 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
426 /// used for compilation and does not generate any instructions, thus it has zero latency.
427 #define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
428 
429 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
430 /// used for compilation and does not generate any instructions, thus it has zero latency.
431 #define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
432 
433 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
434 /// used for compilation and does not generate any instructions, thus it has zero latency.
435 #define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
436 
437 #define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
438 
439 #define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
440 
441 /// Cast vector of type AKSIMD_V4COND to AKSIMD_V4I32.
442 #define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) (AKSIMD_V4COND)(__vec__)
443 
444 /// Cast vector of type AKSIMD_V4I32 to AKSIMD_V4COND.
445 #define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) (AKSIMD_V4COND)(__vec__)
446 
447 //@}
448 ////////////////////////////////////////////////////////////////////////
449 
450 ////////////////////////////////////////////////////////////////////////
451 /// @name AKSIMD logical operations
452 //@{
453 
454 /// Computes the bitwise AND of the 128-bit value in a and the
455 /// 128-bit value in b (see _mm_and_si128)
456 #define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
457 
458 /// Compares the 8 signed 16-bit integers in a and the 8 signed
459 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
460 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
461  vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
462 
463 /// Compares for less than or equal (see _mm_cmple_ps)
464 #define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
465 
466 #define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
467 #define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
468 
469 #define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32(__a__,__b__)
470 #define AKSIMD_NOT_V4I32( __a__ ) veorq_s32(__a__, vdupq_n_s32(~0u))
471 
472 #define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
473 
474 static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
475 {
476  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
477  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
478  uint32x4_t res = veorq_u32(t0, t1);
479  return vreinterpretq_f32_u32(res);
480 }
481 
482 static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
483 {
484  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
485  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
486  uint32x4_t res = vorrq_u32(t0, t1);
487  return vreinterpretq_f32_u32(res);
488 }
489 
490 static AkForceInline AKSIMD_V4F32 AKSIMD_AND_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
491 {
492  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
493  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
494  uint32x4_t res = vandq_u32(t0, t1);
495  return vreinterpretq_f32_u32(res);
496 }
497 
498 static AkForceInline AKSIMD_V4F32 AKSIMD_NOT_V4F32( const AKSIMD_V4F32& in_vec )
499 {
500  uint32x4_t allSet = vdupq_n_u32(~0u);
501  uint32x4_t reinterpret = vreinterpretq_u32_f32(in_vec);
502  uint32x4_t result = veorq_u32(reinterpret, allSet);
503  return vreinterpretq_f32_u32(result);
504 }
505 
506 #define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__)
507 #define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__)
508 
509 #define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
510 //@}
511 ////////////////////////////////////////////////////////////////////////
512 
513 
514 ////////////////////////////////////////////////////////////////////////
515 /// @name AKSIMD shifting
516 //@{
517 
518 /// Shifts the 4 signed or unsigned 32-bit integers in a left by
519 /// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
520 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
521  vshlq_n_s32( (__vec__), (__shiftBy__) )
522 
523 /// Shifts the 4 signed or unsigned 32-bit integers in a right by
524 /// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
525 #define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
526  vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
527 
528 /// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
529 /// bits while shifting in the sign bit (see _mm_srai_epi32)
530 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
531  vshrq_n_s32( (__vec__), (__shiftBy__) )
532 
533 //@}
534 ////////////////////////////////////////////////////////////////////////
535 
536 
537 ////////////////////////////////////////////////////////////////////////
538 /// @name AKSIMD shuffling
539 //@{
540 
541 // Macro for combining two vector of 2 elements into one vector of
542 // 4 elements.
543 #define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
544 
545 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
546 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
547  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
548 
549 /// Selects four specific single-precision, floating-point values from
550 /// a and b, based on the mask i (see _mm_shuffle_ps)
551 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
552 // If you get a link error, it's probably because the required
553 // _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > is not implemented in
554 // <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
555 #if defined(__clang__)
556  #if defined(__has_builtin) && __has_builtin(__builtin_shufflevector)
557  #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
558  __builtin_shufflevector(a, b, (zyxw >> 0) & 0x3, (zyxw >> 2) & 0x3, ((zyxw >> 4) & 0x3) + 4, ((zyxw >> 6) & 0x3) + 4 )
559  #endif
560 #endif
561 
562 #ifndef AKSIMD_SHUFFLE_V4F32
563 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
564  _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
565 
566 // Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are
567 // implemented in a separate header file to keep this one cleaner:
568 #include <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>
569 
570 #endif
571 
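// Illustrative example (editor's addition, not in the SDK source): AKSIMD_SHUFFLE_V4F32 follows the
// _mm_shuffle_ps convention, taking its two low result lanes from a and its two high result lanes from b.
// Hypothetical usage:
//   AKSIMD_V4F32 rev = AKSIMD_SHUFFLE_V4F32(v, v, AKSIMD_SHUFFLE(0, 1, 2, 3)); // reverses v: {v3, v2, v1, v0}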
572 #define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
573 
574 /// Moves the upper two single-precision, floating-point values of b to
575 /// the lower two single-precision, floating-point values of the result.
576 /// The upper two single-precision, floating-point values of a are passed
577 /// through to the result.
578 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
579 #define AKSIMD_MOVEHL_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__b__, __a__, AKSIMD_SHUFFLE(3,2,3,2))
580 
581 /// Moves the lower two single-precision, floating-point values of b to
582 /// the upper two single-precision, floating-point values of the result.
583 /// The lower two single-precision, floating-point values of a are passed
584 /// through to the result.
585 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
586 #define AKSIMD_MOVELH_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__a__, __b__, AKSIMD_SHUFFLE(1,0,1,0))
587 
588 /// Swap the 2 lower floats together and the 2 higher floats together.
589 #define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
590 
591 /// Swap the 2 lower floats with the 2 higher floats.
592 #define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
593 
594 /// Barrel-shift all floats by one.
595 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
596 
597 /// Duplicates the odd items into the even items (d c b a -> d d b b )
598 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
599 
600 /// Duplicates the even items into the odd items (d c b a -> c c a a )
601 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
602 //@}
603 ////////////////////////////////////////////////////////////////////////
604 
605 
606 ////////////////////////////////////////////////////////////////////////
607 /// @name AKSIMD arithmetic
608 //@{
609 
610 /// Subtracts the four single-precision, floating-point values of
611 /// a and b (a - b) (see _mm_sub_ps)
612 #define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
613 
614 /// Subtracts the two single-precision, floating-point values of
615 /// a and b
616 #define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
617 
618 /// Subtracts the lower single-precision, floating-point values of a and b.
619 /// The upper three single-precision, floating-point values are passed through from a.
620 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
621 #define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
622  vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) );
623 
624 /// Adds the four single-precision, floating-point values of
625 /// a and b (see _mm_add_ps)
626 #define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
627 
628 /// Adds the two single-precision, floating-point values of
629 /// a and b
630 #define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
631 
632 /// Adds the four integers of a and b
633 #define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
634 
635 /// Multiplies the 4 low-parts of both operands into the 4 32-bit integers (no overflow)
636 #define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
637 
638 /// Multiplies the 4 low-parts of both operands into the 4 32-bit integers (no overflow)
639 #define AKSIMD_MULLO_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
640 
641 /// Compare the content of four single-precision, floating-point values of
642 /// a and b
643 #define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
644 
645 /// Compare the content of two single-precision, floating-point values of
646 /// a and b
647 #define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
648 
649 /// Adds the lower single-precision, floating-point values of a and b; the
650 /// upper three single-precision, floating-point values are passed through from a.
651 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
652 #define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
653  vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
654 
655 /// Multiplies the four single-precision, floating-point values
656 /// of a and b (see _mm_mul_ps)
657 #define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
658 
659 /// Multiplies the four single-precision, floating-point values of a
660 /// by the single-precision, floating-point scalar b
661 #define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
662 
663 /// Rough estimation of division
664 AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32( AKSIMD_V4F32 a, AKSIMD_V4F32 b )
665 {
666  AKSIMD_V4F32 inv = vrecpeq_f32(b);
667  AKSIMD_V4F32 restep = vrecpsq_f32(b, inv);
668  inv = vmulq_f32(restep, inv);
669  return vmulq_f32(a, inv);
670 }
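// Illustrative note (editor's addition, not in the SDK source): vrecpeq_f32 yields a coarse reciprocal
// estimate and the single vrecpsq_f32 Newton-Raphson step roughly doubles its precision, so
// AKSIMD_DIV_V4F32 is a fast approximation rather than an exact IEEE division.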
671 
672 /// Multiplies the two single-precision, floating-point values
673 /// of a and b
674 #define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
675 
676 /// Multiplies the two single-precision, floating-point values of a
677 /// by the single-precision, floating-point scalar b
678 #define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
679 
680 /// Multiplies the lower single-precision, floating-point values of
681 /// a and b; the upper three single-precision, floating-point values
682 /// are passed through from a.
683 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
684 #define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
685  vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
686 
687 /// Vector multiply-add and multiply-subtract operations (AArch64 uses the fused variants directly where appropriate)
688 #if defined(AK_CPU_ARM_64)
689  #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vfmaq_f32( (__c__), (__a__), (__b__) )
690  #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) vfma_f32( (__c__), (__a__), (__b__) )
691  #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vfmaq_n_f32( (__c__), (__a__), (__b__) )
692  #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vfma_n_f32( (__c__), (__a__), (__b__) )
693 #else
694  #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
695  #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
696  #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
697  #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
698 #endif
699 
700 /// Not a direct translation to vfmsq_f32 because that operation does -a*b+c, not a*b-c.
701 /// Explicitly adding an additional negation tends to produce worse codegen than giving the compiler a chance to re-order things slightly
702 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
703 #define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
704 
705 /// Vector multiply-add operation.
706 AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32( const AKSIMD_V4F32& __a__, const AKSIMD_V4F32& __b__, const AKSIMD_V4F32& __c__ )
707 {
708  return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
709 }
710 
711 /// Computes the minima of the four single-precision, floating-point
712 /// values of a and b (see _mm_min_ps)
713 #define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
714 
715 /// Computes the minima of the two single-precision, floating-point
716 /// values of a and b
717 #define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
718 
719 /// Computes the maximums of the four single-precision, floating-point
720 /// values of a and b (see _mm_max_ps)
721 #define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
722 
723 /// Computes the maximums of the two single-precision, floating-point
724 /// values of a and b
725 #define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
726 
727 /// Returns absolute value
728 #define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
729 
730 /// Changes the sign
731 #define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
732 #define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
733 
734 /// Square root (4 floats)
735 #define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
736 
737 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
738 #define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
739 
740 /// Square root (2 floats)
741 #define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
742 
743 /// Reciprocal of x (1/x)
744 #define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32(__a__)
745 
746 /// Faked in-place vector horizontal add.
747 /// \akwarning
748 /// Don't expect this to be very efficient.
749 /// \endakwarning
750 static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
751 {
752  AKSIMD_V4F32 vAb = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0xB1);
753  AKSIMD_V4F32 vHaddAb = AKSIMD_ADD_V4F32(vVec, vAb);
754  AKSIMD_V4F32 vHaddCd = AKSIMD_SHUFFLE_V4F32(vHaddAb, vHaddAb, 0x4E);
755  AKSIMD_V4F32 vHaddAbcd = AKSIMD_ADD_V4F32(vHaddAb, vHaddCd);
756  return vHaddAbcd;
757 }
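// Illustrative example (editor's addition, not in the SDK source): the two shuffle/add rounds leave
// the total in every lane, e.g. {1.f, 2.f, 3.f, 4.f} becomes {10.f, 10.f, 10.f, 10.f}.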
758 
759 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
760 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
761 {
762 #ifdef AKSIMD_DECLARE_V4F32
763  static const AKSIMD_DECLARE_V4F32( vSign, -0.f, 0.f, -0.f, 0.f);
764 #else
765  static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
766 #endif
767 
768  float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2); // val[0] will be reals extended, val[1] will be imag
769  vC2Ext.val[1] = AKSIMD_XOR_V4F32(vC2Ext.val[1], vSign);
770  float32x4_t vC1Rev = vrev64q_f32(vCIn1);
771  float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);
772  float32x4_t vFinal = AKSIMD_MADD_V4F32(vC1Rev, vC2Ext.val[1], vMul);
773  return vFinal;
774 }
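// Illustrative example (editor's addition, not in the SDK source): each operand holds two complex
// numbers stored as {re0, im0, re1, im1}; multiplying (1 + 2i) by (3 + 4i) in a lane pair yields
// (-5 + 10i), i.e. {re*re - im*im, re*im + im*re}.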
775 
776 // Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
777 // to/from packed elements in b, and store the results in dst.
778 static AkForceInline AKSIMD_V4F32 AKSIMD_ADDSUB_V4F32( AKSIMD_V4F32 vIn1, AKSIMD_V4F32 vIn2 )
779 {
780 #ifdef AKSIMD_DECLARE_V4F32
781  static const AKSIMD_DECLARE_V4F32(vSign, -0.f, 0.f, -0.f, 0.f);
782 #else
783  static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
784 #endif
785  float32x4_t vIn2SignFlip = AKSIMD_XOR_V4F32(vIn2, vSign);
786  return vaddq_f32(vIn1, vIn2SignFlip);
787 }
788 
789 //@}
790 ////////////////////////////////////////////////////////////////////////
791 
792 
793 ////////////////////////////////////////////////////////////////////////
794 /// @name AKSIMD packing / unpacking
795 //@{
796 
797 /// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
798 /// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
799 #define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
800 
801 /// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
802 /// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
803 #define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
804 
805 /// Selects and interleaves the lower two single-precision, floating-point
806 /// values from a and b (see _mm_unpacklo_ps)
807 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
808 {
809  // sce_vectormath_xayb(in_vec1, in_vec2)
810  float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
811  float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
812  float32x2x2_t xa_yb = vtrn_f32( xy, ab );
813  AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
814  return xayb;
815 }
816 
817 /// Selects and interleaves the upper two single-precision, floating-point
818 /// values from a and b (see _mm_unpackhi_ps)
819 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
820 {
821  //return sce_vectormath_zcwd( in_vec1, in_vec2 );
822  float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
823  float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
824  float32x2x2_t zc_wd = vtrn_f32( zw, cd );
825  AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
826  return zcwd;
827 }
828 
829 /// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
830 /// integers and saturates (see _mm_packs_epi32)
831 AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
832 {
833  int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
834  int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
835  int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
836  return vreinterpretq_s32_s16( result );
837 }
838 
839 /// V1 = {a,b} => VR = {b,c}
840 /// V2 = {c,d} =>
841 #define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
842 
843 /// V1 = {a,b} => V1 = {a,c}
844 /// V2 = {c,d} => V2 = {b,d}
845 #define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
846 
847 #define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
848 
849 /// V1 = {a,b} => VR = {b,a}
850 #define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
851 
852 // Given four pointers, gathers 32-bits of data from each location,
853 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
854 // e.g. (*addr[0]) := (b a)
855 // e.g. (*addr[1]) := (d c)
856 // e.g. (*addr[2]) := (f e)
857 // e.g. (*addr[3]) := (h g)
858 // return struct has
859 // val[0] := (g e c a)
860 // val[1] := (h f d b)
861 static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
862 {
863  int16x4x2_t ret16{
864  vdup_n_s16(0),
865  vdup_n_s16(0)
866  };
867 
868  ret16 = vld2_lane_s16(addr0, ret16, 0);
869  ret16 = vld2_lane_s16(addr1, ret16, 1);
870  ret16 = vld2_lane_s16(addr2, ret16, 2);
871  ret16 = vld2_lane_s16(addr3, ret16, 3);
872 
873  AKSIMD_V4I32X2 ret{
874  vmovl_s16(ret16.val[0]),
875  vmovl_s16(ret16.val[1])
876  };
877  return ret;
878 }
879 
880 // Given four pointers, gathers 64-bits of data from each location,
881 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
882 // e.g. (*addr[0]) := (d c b a)
883 // e.g. (*addr[1]) := (h g f e)
884 // e.g. (*addr[2]) := (l k j i)
885 // e.g. (*addr[3]) := (p o n m)
886 // return struct has
887 // val[0] := (m i e a)
888 // val[1] := (n j f b)
889 // val[2] := (o k g c)
890 // val[3] := (p l h d)
891 
892 static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
893 {
894  int16x4x4_t ret16{
895  vdup_n_s16(0),
896  vdup_n_s16(0),
897  vdup_n_s16(0),
898  vdup_n_s16(0)
899  };
900 
901  ret16 = vld4_lane_s16(addr0, ret16, 0);
902  ret16 = vld4_lane_s16(addr1, ret16, 1);
903  ret16 = vld4_lane_s16(addr2, ret16, 2);
904  ret16 = vld4_lane_s16(addr3, ret16, 3);
905 
906  AKSIMD_V4I32X4 ret{
907  vmovl_s16(ret16.val[0]),
908  vmovl_s16(ret16.val[1]),
909  vmovl_s16(ret16.val[2]),
910  vmovl_s16(ret16.val[3])
911  };
912  return ret;
913 }
914 
915 //@}
916 ////////////////////////////////////////////////////////////////////////
917 
918 ////////////////////////////////////////////////////////////////////////
919 /// @name AKSIMD vector comparison
920 /// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
921 //@{
922 
923 #define AKSIMD_CMP_CTRLMASK uint32x4_t
924 
925 /// Compare each float element and return control mask.
926 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
927 
928 /// Compare each float element and return control mask.
929 #define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
930 
931 /// Compare each float element and return control mask.
932 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
933 
934 /// Compare each float element and return control mask.
935 #define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
936 
937 /// Compare each integer element and return control mask.
938 #define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
939 
940 /// Compare each float element and return control mask.
941 #define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
942 
943 /// Compare each integer element and return control mask.
944 #define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
945 
946 /// Returns a when the control mask is 0 and b when the control mask is non-zero; the control mask c is usually provided by the comparison operations above
947 #define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
948 
949 // (cond1 >= cond2) ? b : a.
950 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
951 
952 // a >= 0 ? b : c, written in the same order as the C++ ternary operator.
953 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
954 
955 #define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
956 
957 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 )
958 {
959 #ifdef AKSIMD_DECLARE_V4F32
960  static const AKSIMD_DECLARE_V4I32(movemask, 1, 2, 4, 8);
961  static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
962 #else
963  static const uint32x4_t movemask = { 1, 2, 4, 8 };
964  static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
965 #endif
966 
967  uint32x4_t t0 = in_vec1;
968  uint32x4_t t1 = vtstq_u32(t0, highbit);
969  uint32x4_t t2 = vandq_u32(t1, movemask);
970  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
971  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
972 }
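// Illustrative example (editor's addition, not in the SDK source): this emulates _mm_movemask_ps,
// packing the sign bit of each lane into bits 0..3 of the returned int; e.g. a compare result of
// {~0u, 0, 0, ~0u} yields the mask 0b1001 (9).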
973 
974 #ifndef AK_WIN
975 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
976 {
977  return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
978 }
979 #endif
980 
981 // returns true if every element of the provided vector is zero
982 static AkForceInline bool AKSIMD_TESTZERO_V4I32( AKSIMD_V4I32 a )
983 {
984 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
985  uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
986  return maxValue == 0;
987 #else
988  int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
989  return vget_lane_s64(orReduce, 0) == 0;
990 #endif
991 }
992 #define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
993 #define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
994 
995 static AkForceInline bool AKSIMD_TESTONES_V4I32( AKSIMD_V4I32 a )
996 {
997 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
998  uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
999  return minValue == ~0;
1000 #else
1001  int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
1002  return vget_lane_s64(andReduce, 0) == ~0LL;
1003 #endif
1004 }
1005 #define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
1006 #define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))
1007 
1008 
1009 static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
1010 {
1011  int32x4_t temp = AKSIMD_SETV_V4I32(8, 4, 2, 1);
1012  int32x4_t xvec = AKSIMD_SET_V4I32((AkInt32)x);
1013  int32x4_t xand = AKSIMD_AND_V4I32(xvec, temp);
1014  return AKSIMD_EQ_V4I32(temp, xand);
1015 }
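// Illustrative example (editor's addition, not in the SDK source): bit i of x selects lane i of the
// resulting condition mask, so AKSIMD_SETMASK_V4COND(5) produces {~0u, 0, ~0u, 0} across lanes 0..3.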
1016 
1017 
1018 //@}
1019 ////////////////////////////////////////////////////////////////////////
1020 
1021 #endif //_AKSIMD_ARM_NEON_H_
1022 