Version
Wwise SDK 2021.1.9
AkSimd.h
Go to the documentation of this file.
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Version: v2021.1.9 Build: 7847
25  Copyright (c) 2006-2022 Audiokinetic Inc.
26 *******************************************************************************/
27 
28 // AkSimd.h
29 
30 /// \file
31 /// AKSIMD - arm_neon implementation
32 
33 #ifndef _AKSIMD_ARM_NEON_H_
34 #define _AKSIMD_ARM_NEON_H_
35 
36 #if defined _MSC_VER && defined _M_ARM64
37  #include <arm64_neon.h>
38 #else
39  #include <arm_neon.h>
40 #endif
42 
43 // Platform specific defines for prefetching
44 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)
45 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
46 #if defined __clang__ || defined __GNUC__
47 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) __builtin_prefetch(((char *)(__add__))+(__offset__))
48 #else
49 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ )
50 #endif
51 
52 ////////////////////////////////////////////////////////////////////////
53 /// @name Platform specific memory size alignment for allocation purposes
54 //@{
55 
56 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
57 
58 //@}
59 ////////////////////////////////////////////////////////////////////////
60 
61 ////////////////////////////////////////////////////////////////////////
62 /// @name AKSIMD types
63 //@{
64 
65 typedef int32x4_t AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers
66 typedef int16x8_t AKSIMD_V8I16; ///< Vector of 8 16-bit signed integers
67 typedef int16x4_t AKSIMD_V4I16; ///< Vector of 4 16-bit signed integers
68 typedef uint32x4_t AKSIMD_V4UI32; ///< Vector of 4 32-bit unsigned integers
69 typedef uint32x2_t AKSIMD_V2UI32; ///< Vector of 2 32-bit unsigned integers
70 typedef int32x2_t AKSIMD_V2I32; ///< Vector of 2 32-bit signed integers
71 typedef float32_t AKSIMD_F32; ///< 32-bit float
72 typedef float32x2_t AKSIMD_V2F32; ///< Vector of 2 32-bit floats
73 typedef float32x4_t AKSIMD_V4F32; ///< Vector of 4 32-bit floats
74 
75 typedef uint32x4_t AKSIMD_V4COND; ///< Vector of 4 comparison results (one 32-bit unsigned lane per result)
76 typedef uint32x4_t AKSIMD_V4ICOND; ///< Vector of 4 comparison results (same underlying type as AKSIMD_V4COND)
77 typedef uint32x4_t AKSIMD_V4FCOND; ///< Vector of 4 comparison results (same underlying type as AKSIMD_V4COND)
78 
79 #if defined(AK_CPU_ARM_NEON)
80 typedef float32x2x2_t AKSIMD_V2F32X2; ///< Two vectors of 2 32-bit floats
81 typedef float32x4x2_t AKSIMD_V4F32X2; ///< Two vectors of 4 32-bit floats
82 typedef float32x4x4_t AKSIMD_V4F32X4; ///< Four vectors of 4 32-bit floats
83 
84 typedef int32x4x2_t AKSIMD_V4I32X2; ///< Two vectors of 4 32-bit signed integers
85 typedef int32x4x4_t AKSIMD_V4I32X4; ///< Four vectors of 4 32-bit signed integers
86 #endif
87 
88 //@}
89 ////////////////////////////////////////////////////////////////////////
90 
91 ////////////////////////////////////////////////////////////////////////
92 /// @name AKSIMD loading / setting
93 //@{
94 
95 /// Loads four single-precision, floating-point values.
96 /// The address has no alignment requirement, (see _mm_loadu_ps).
97 #define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
98 
99 /// Loads four single-precision, floating-point values.
100 /// The address has no alignment requirement, (see _mm_loadu_ps).
101 #define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
102 
103 /// Loads a single single-precision, floating-point value, copying it into
104 /// all four words (see _mm_load1_ps, _mm_load_ps1)
105 #define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
106 
107 /// Sets the four single-precision, floating-point values to __scalar__ (see
108 /// _mm_set1_ps, _mm_set_ps1)
109 #define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
110 
111 /// Builds a 4-float vector from individual scalars, listed from the highest
112 /// lane down to the lowest: lane 3 = d, lane 2 = c, lane 1 = b, lane 0 = a.
113 static AkForceInline float32x4_t AKSIMD_SETV_V4F32(float32_t d, float32_t c, float32_t b, float32_t a) {
114  float32x4_t lanes = vdupq_n_f32(a); // lane 0 is final; lanes 1..3 are overwritten below
115  lanes = vsetq_lane_f32(b, lanes, 1);
116  lanes = vsetq_lane_f32(c, lanes, 2);
117  lanes = vsetq_lane_f32(d, lanes, 3);
118  return lanes;
119 }
120 
121 /// Sets the four integer values to __scalar__
122 #define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
123 
124 /// Builds a 4-integer vector from individual scalars: lane 3 = d, lane 2 = c, lane 1 = b, lane 0 = a.
125 static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a) {
126  int32x4_t lanes = vdupq_n_s32(a); // lane 0 is final; lanes 1..3 are overwritten below
127  lanes = vsetq_lane_s32(b, lanes, 1);
128  lanes = vsetq_lane_s32(c, lanes, 2);
129  lanes = vsetq_lane_s32(d, lanes, 3);
130  return lanes;
131 }
132 
133 /// Packs two 64-bit integers (64-bit lane 0 = a, lane 1 = b) and returns them viewed as four 32-bit lanes.
134 static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a) {
135 #if defined AK_CPU_ARM_64
136  // ARM64 can handle 64-bit inputs directly, even when they are dereferenced from a pointer.
137  int64x2_t wide = vdupq_n_s64(a);
138  wide = vsetq_lane_s64(b, wide, 1);
139  return vreinterpretq_s32_s64(wide);
140 #else
141  // On 32-bit ARM an int64_t input could turn into a potentially unaligned 64-bit load;
142  // feeding the value in as 32-bit halves avoids any alignment requirement.
143  const int32_t aLow = int32_t((a >> 0) & 0xFFFFFFFF);
144  const int32_t aHigh = int32_t((a >> 32) & 0xFFFFFFFF);
145  const int32_t bLow = int32_t((b >> 0) & 0xFFFFFFFF);
146  const int32_t bHigh = int32_t((b >> 32) & 0xFFFFFFFF);
147  int32x4_t halves = vdupq_n_s32(aLow);
148  halves = vsetq_lane_s32(aHigh, halves, 1);
149  halves = vsetq_lane_s32(bLow, halves, 2);
150  halves = vsetq_lane_s32(bHigh, halves, 3);
151  return halves;
152 #endif
153 }
154 
155 // NOTE(review): the declaration line for this body is missing from this listing. The body reads two 64-bit floating-point parameters named b and a and returns a float32x4_t; presumably the double-precision counterpart of AKSIMD_SETV_V2I64 - restore it from the SDK header.
156 {
157 #if defined AK_CPU_ARM_64
158  float64x2_t ret = (float64x2_t)vdupq_n_s64(0); // zero-filled vector, viewed as two doubles
159  ret = vsetq_lane_f64(b, ret, 1); // b goes into 64-bit lane 1
160  ret = vsetq_lane_f64(a, ret, 0); // a goes into 64-bit lane 0
161  return (float32x4_t)(ret); // same 128 bits, returned as four 32-bit float lanes
162 #else
163  int64_t a64 = *(int64_t*)&a; // raw bit pattern of a; NOTE(review): pointer-cast type punning, assumes a is a 64-bit type - confirm
164  int64_t b64 = *(int64_t*)&b; // raw bit pattern of b (same caveat)
165  int32x4_t ret = vdupq_n_s32(0);
166  ret = vsetq_lane_s32(int32_t((b64 >> 32) & 0xFFFFFFFF), ret, 3); // upper half of b
167  ret = vsetq_lane_s32(int32_t((b64 >> 0) & 0xFFFFFFFF), ret, 2); // lower half of b
168  ret = vsetq_lane_s32(int32_t((a64 >> 32) & 0xFFFFFFFF), ret, 1); // upper half of a
169  ret = vsetq_lane_s32(int32_t((a64 >> 0) & 0xFFFFFFFF), ret, 0); // lower half of a
170  return vreinterpretq_f32_s32(ret); // bits are unchanged, only the vector type differs
171 #endif
172 }
173 
174 /// Sets the four single-precision, floating-point values to zero (see
175 /// _mm_setzero_ps)
176 #define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
177 
177 /// Loads a single-precision, floating-point value into the low word and clears the upper three words.
178 /// Expands to a plain expression (no trailing semicolon), so it can be nested in other expressions.
179 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
180 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
182 
183 /// Loads four 32-bit signed integer values (aligned)
184 #define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
185 
186 /// Loads 8 16-bit signed integer values (aligned)
187 #define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
188 
189 /// Loads 4 16-bit signed integer values (aligned)
190 #define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
191 
192 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
193 #define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
194 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
195 #define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
196 
197 /// Loads two single-precision, floating-point values; the _LANE form loads one value into lane __lane__ of __vec__ (both expand to plain expressions)
198 #define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
199 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
200 
201 /// Sets the two single-precision, floating-point values to __scalar__
202 #define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
203 
204 /// Sets the two single-precision, floating-point values to zero
205 #define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
206 
207 /// Loads data from memory and de-interleaves
208 #define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
209 #define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
210 
211 /// Loads data from memory and de-interleaves; only selected lane (both macros expand to plain expressions, no trailing semicolon)
212 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
213 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
214 
215 //@}
216 ////////////////////////////////////////////////////////////////////////
217 
218 
219 ////////////////////////////////////////////////////////////////////////
220 /// @name AKSIMD storing
221 //@{
222 
223 /// Stores four single-precision, floating-point values.
224 /// The address has no alignment requirement, (see _mm_storeu_ps).
225 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
226 
227 /// Stores four single-precision, floating-point values.
228 /// The address has no alignment requirement, (see _mm_storeu_ps).
229 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
230 
231 /// Stores the lower single-precision, floating-point value.
232 /// *p := a0 (see _mm_store_ss)
233 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
234 
235 /// Stores four 32-bit integer values. The address must be 16-byte aligned.
236 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
237 
238 /// Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
239 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
240 
241 /// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
242 #define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
243 
244 /// Stores two single-precision, floating-point values (8 bytes) to __addr__. The pointer is cast to float32_t*, like the other load/store macros in this file.
245 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (float32_t*)(__addr__), (__vName__) )
246 
247 /// Stores data by interleaving into memory
248 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
249 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
250 
251 //@}
252 ////////////////////////////////////////////////////////////////////////
253 
254 
255 ////////////////////////////////////////////////////////////////////////
256 /// @name AKSIMD conversion
257 //@{
258 
259 /// Converts the four signed 32-bit integer values of a to single-precision,
260 /// floating-point values (see _mm_cvtepi32_ps)
261 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
262 
263 /// Converts the four single-precision, floating-point values of a to signed
264 /// 32-bit integer values (see _mm_cvtps_epi32)
265 // NOTE(review): the declaration line (and opening brace) for this body is missing from this listing. The body takes a float vector named a and returns a signed 32-bit integer vector; presumably the rounding counterpart of AKSIMD_TRUNCATE_V4F32_TO_V4I32 - restore it from the SDK header.
266 #if defined AK_CPU_ARM_64
267  return vcvtaq_s32_f32(a); // single-instruction conversion on ARM64
268 #else
269  // on ARMv7, need to add 0.5 (away from zero) and truncate that
270  float32x4_t halfPos = vdupq_n_f32(0.5f);
271  float32x4_t halfNeg = vdupq_n_f32(-0.5f);
272  float32x4_t zero = vdupq_n_f32(0.0f);
273  const uint32x4_t signMask = vcgtq_f32(a, zero); // lanes where a > 0
274  const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg); // +0.5 for positive lanes, -0.5 for all others
275  const float32x4_t aOffset = vaddq_f32(a, signedHalf);
276  return vcvtq_s32_f32(aOffset); // truncating conversion of the offset value
277 #endif
278 }
279 
280 /// Converts the four single-precision, floating-point values of a to signed
281 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
282 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
283 
284 /// Converts the 4 half-precision floats in the lower 64-bits of the provided
285 /// vector to 4 full-precision floats
286 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
287 
288 /// Converts the 4 half-precision floats in the upper 64-bits of the provided
289 /// vector to 4 full-precision floats
290 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
291 
292 
293 // NOTE(review): the declaration line for this body is missing from this listing. The macros above call it as AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER with a uint16x4_t argument, and the body names that parameter vecs16 and produces a float32x4_t - restore the line from the SDK header.
294 {
295 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
296 
297  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
298  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
299  float32x4_t ret;
300  __asm__("fcvtl %0.4s, %1.4h" \
301  : "=w"(ret) \
302  : "w"(vecs16)
303  );
304  return ret;
305 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
306  float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
307  float32x4_t ret = vcvt_f32_f16(vecf16);
308  uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16); // move each half's sign bit (0x8000) up to bit 31
309  ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData); // OR the sign back in; NOTE(review): s32/u32/f32 vector types are mixed on this line - confirm it compiles as intended
310  return ret;
311 #elif defined(AK_CPU_ARM_64)
312  return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
313 #else
314  uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16); // widen to 32 bits with the float16 bits in the upper half
315  uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000)); // drop the sign bit
316  uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds
317 
318  // Determine if value is denorm or not, and use that to determine how much exponent should be scaled by, and if we need post-scale fp adjustment
319  uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
320  uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
321  uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
322 
323  // Apply the exponent increment and adjust
324  uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
325  uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
326 
327  // if fp16 val was inf or nan, preserve the inf/nan exponent field (we can just 'or' the inf/nan exponent bits in)
328  uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
329  uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
330  uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
331 
332  // reincorporate the sign
333  uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
334  float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
335  return assembledFloat;
336 #endif
337 }
338 
339 
340 /// Converts the 4 full-precision floats vector to 4 half-precision floats
341 /// occupying the lower bits and leaving the upper bits as zero
342 // NOTE(review): the declaration line for this body is missing from this listing. The body takes a float32x4_t named vec and returns the packed halves as an int32x4_t - restore the line from the SDK header.
343 {
344 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
345  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
346  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
347  int32x4_t ret;
348  __asm__("fcvtn %1.4h, %1.4s\n" \
349  "\tmov %0.8b, %1.8b" \
350  : "=w"(ret) \
351  : "w"(vec)
352  ); // NOTE(review): the fcvtn destination is %1, which is declared as an input-only operand - confirm the register holding vec may be overwritten here
353  return ret;
354 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
355  float16x4_t ret = vcvt_f16_f32(vec);
356  uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16); // sign bits narrowed down to bit 15 of each half
357  ret = vorr_u16(vreinterpret_s16_f16(ret), signData); // NOTE(review): s16/u16/f16 vector types are mixed on this line - confirm it compiles as intended
358  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
359 #elif defined(AK_CPU_ARM_64)
360  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0))); // converted halves in the low 64 bits, zeros in the high 64 bits
361 #else
362  uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
363  uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
364 
365  // do the processing for values that will be denormed in float16
366  // Add 0.5 to get value within range, and round; then move mantissa data up
367  float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
368  uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
369 
370  // processing for values that will be normal in float16
371  uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF); // -131072 + rounding bias
372  uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
373  uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
374  uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31); // Extend Lsb so that it's -1 when set; NOTE(review): vshrq_n_u32 is an unsigned (logical) shift, so this lane is 0 or 1 rather than 0 or -1 - confirm the intended rounding
375  uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
376  uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
377 
378  // Combine the norm and subnorm paths together
379  uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23); // smallest float32 that yields a normalized float16
380  uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
381 
382  uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
383 
384  // apply inf/nan check
385  uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000)); // test if exponent bits are zero or not
386  uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
387  uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec)); // mark the parts of the vector where we have a mantissa (i.e. NAN) as 0xffffffff
388  uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000)); // set the NaN mantissa bit if mantissa suggests this is NaN
389  uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
390  uint32x4_t infNanData = vorrq_u32(infData, nantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
391 
392  uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData); // and combine the results
393 
394  // reincorporate the original sign
395  uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
396 
397  // store results packed in lower 64 bits, and set upper 64 to zero
398  uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
399  return vreinterpretq_s32_u16(resultZip.val[1]);
400 #endif
401 }
402 
403 //@}
404 ////////////////////////////////////////////////////////////////////////
405 
406 ////////////////////////////////////////////////////////////////////////
407 /// @name AKSIMD cast
408 //@{
409 
410 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
411 /// used for compilation and does not generate any instructions, thus it has zero latency.
412 #define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
413 
414 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
415 /// used for compilation and does not generate any instructions, thus it has zero latency.
416 #define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
417 
418 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
419 /// used for compilation and does not generate any instructions, thus it has zero latency.
420 #define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
421 
422 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
423 /// used for compilation and does not generate any instructions, thus it has zero latency.
424 #define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
425 
426 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
427 /// used for compilation and does not generate any instructions, thus it has zero latency.
428 #define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
429 
430 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
431 /// used for compilation and does not generate any instructions, thus it has zero latency.
432 #define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
433 
434 #define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
435 
436 #define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
437 //@}
438 ////////////////////////////////////////////////////////////////////////
439 
440 ////////////////////////////////////////////////////////////////////////
441 /// @name AKSIMD logical operations
442 //@{
443 
444 /// Computes the bitwise AND of the 128-bit value in a and the
445 /// 128-bit value in b (see _mm_and_si128)
446 #define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
447 
448 /// Compares the 8 signed 16-bit integers in a and the 8 signed
449 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
450 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
451  vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
452 
453 /// Compares for less than or equal (see _mm_cmple_ps)
454 #define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
455 
456 #define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__)) ///< Signed per-lane a < b; the comparison mask is returned as a signed integer vector
457 #define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__)) ///< Signed per-lane a > b; the comparison mask is returned as a signed integer vector
458 
459 #define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32(__a__,__b__) ///< Bitwise OR of a and b
460 #define AKSIMD_NOT_V4I32( __a__ ) veorq_s32(__a__, vdupq_n_s32(~0u)) ///< Bitwise NOT of a, done as XOR against an all-ones vector
461 
462 #define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__) ///< Bitwise XOR of a and b
463 
464 /// Bitwise XOR of the raw bits of two 4-float vectors.
465 static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
466 {
467  // The XOR is done on the unsigned-integer view of the floats, then viewed as floats again.
468  const uint32x4_t xoredBits = veorq_u32( vreinterpretq_u32_f32(in_vec0), vreinterpretq_u32_f32(in_vec1) );
469  return vreinterpretq_f32_u32(xoredBits);
470 }
471 
472 /// Bitwise OR of the raw bits of two 4-float vectors.
473 static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
474 {
475  // The OR is done on the unsigned-integer view of the floats, then viewed as floats again.
476  const uint32x4_t oredBits = vorrq_u32( vreinterpretq_u32_f32(in_vec0), vreinterpretq_u32_f32(in_vec1) );
477  return vreinterpretq_f32_u32(oredBits);
478 }
479 
480 // NOTE(review): the declaration line for this body is missing from this listing. The body mirrors AKSIMD_OR_V4F32 above (parameters in_vec0 / in_vec1, float vector result) but uses vandq_u32 - presumably the bitwise-AND sibling; restore the line from the SDK header.
481 {
482  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0); // view both float inputs as raw 32-bit lanes
483  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
484  uint32x4_t res = vandq_u32(t0, t1); // bitwise AND of the raw bits
485  return vreinterpretq_f32_u32(res); // hand the bits back as floats
486 }
487 
488 // NOTE(review): the declaration line for this body is missing from this listing. The body takes a single float vector named in_vec and returns a float vector - presumably the bitwise-NOT sibling of the functions above; restore the line from the SDK header.
489 {
490  uint32x4_t allSet = vdupq_n_u32(~0u); // every bit set in every lane
491  uint32x4_t reinterpret = vreinterpretq_u32_f32(in_vec); // raw bits of the input floats
492  uint32x4_t result = veorq_u32(reinterpret, allSet); // XOR with all ones flips every bit
493  return vreinterpretq_f32_u32(result); // hand the bits back as floats
494 }
495 
496 #define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__) ///< Bitwise OR of two comparison-result vectors
497 #define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__) ///< Bitwise AND of two comparison-result vectors
498 
499 #define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__) ///< Subtracts the four 32-bit signed integers of b from those of a (a - b)
500 //@}
501 ////////////////////////////////////////////////////////////////////////
502 
503 
504 ////////////////////////////////////////////////////////////////////////
505 /// @name AKSIMD shifting
506 //@{
507 
508 /// Shifts the 4 signed or unsigned 32-bit integers in a left by
509 /// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
510 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
511  vshlq_n_s32( (__vec__), (__shiftBy__) )
512 
513 /// Shifts the 4 signed or unsigned 32-bit integers in a right by
514 /// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
515 #define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
516  vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
517 
518 /// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
519 /// bits while shifting in the sign bit (see _mm_srai_epi32)
520 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
521  vshrq_n_s32( (__vec__), (__shiftBy__) )
522 
523 //@}
524 ////////////////////////////////////////////////////////////////////////
525 
526 
527 ////////////////////////////////////////////////////////////////////////
528 /// @name AKSIMD shuffling
529 //@{
530 
531 // Macro for combining two vector of 2 elements into one vector of
532 // 4 elements.
533 #define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
534 
535 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
536 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
537  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
538 
539 /// Selects four specific single-precision, floating-point values from
540 /// a and b, based on the mask i (see _mm_shuffle_ps)
541 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
542 // If you get a link error, it's probably because the required
543 // _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > is not implemented in
544 // <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
545 #if defined(__clang__) && defined(__has_builtin)
546 #if __has_builtin(__builtin_shufflevector) // kept in its own nested #if: where __has_builtin is unknown, "__has_builtin(...)" is not a valid #if expression, even after a false defined() test
547 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
548  __builtin_shufflevector((a), (b), ((zyxw) >> 0) & 0x3, ((zyxw) >> 2) & 0x3, (((zyxw) >> 4) & 0x3) + 4, (((zyxw) >> 6) & 0x3) + 4 )
549 #endif
550 #endif
551 #if !defined(AKSIMD_SHUFFLE_V4F32) // no compiler builtin available: fall back to the templated shuffles
552 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
553 // Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are implemented in a separate header file to keep this one cleaner:
554 #include <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>
555 
556 #endif
557 
558 #define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
559 
560 /// Moves the upper two single-precision, floating-point values of b to
561 /// the lower two single-precision, floating-point values of the result.
562 /// The upper two single-precision, floating-point values of a are passed
563 /// through to the result.
564 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
565 #define AKSIMD_MOVEHL_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__b__, __a__, AKSIMD_SHUFFLE(3,2,3,2))
566 
567 /// Moves the lower two single-precision, floating-point values of b to
568 /// the upper two single-precision, floating-point values of the result.
569 /// The lower two single-precision, floating-point values of a are passed
570 /// through to the result.
571 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
572 #define AKSIMD_MOVELH_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__a__, __b__, AKSIMD_SHUFFLE(1,0,1,0))
573 
574 /// Swap the 2 lower floats together and the 2 higher floats together.
575 #define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
576 
577 /// Swap the 2 lower floats with the 2 higher floats.
578 #define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
579 
580 /// Barrel-shift all floats by one.
581 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
582 
583 /// Duplicates the odd items into the even items (d c b a -> d d b b )
584 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
585 
586 /// Duplicates the even items into the odd items (d c b a -> c c a a )
587 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
588 //@}
589 ////////////////////////////////////////////////////////////////////////
590 
591 
592 ////////////////////////////////////////////////////////////////////////
593 /// @name AKSIMD arithmetic
594 //@{
595 
596 /// Subtracts the four single-precision, floating-point values of
597 /// a and b (a - b) (see _mm_sub_ps)
598 #define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
599 
600 /// Subtracts the two single-precision, floating-point values of
601 /// a and b
602 #define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
603 
604 /// Subtracts the lower single-precision, floating-point values of a and b.
605 /// The upper three single-precision, floating-point values are passed through from a. Expands to a plain expression (no trailing semicolon).
606 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
607 #define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
608  vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
609 
610 /// Adds the four single-precision, floating-point values of
611 /// a and b (see _mm_add_ps)
612 #define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
613 
614 /// Adds the two single-precision, floating-point values of
615 /// a and b
616 #define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
617 
618 /// Adds the four integers of a and b
619 #define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
620 
621 /// Multiplies the 4 low-parts of both operand into the 4 32-bit integers (no overflow)
622 #define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
623 
624 /// Multiplies the 4 low-parts of both operand into the 4 32-bit integers (no overflow)
625 #define AKSIMD_MULLO_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
626 
627 /// Compare the content of four single-precision, floating-point values of
628 /// a and b
629 #define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
630 
631 /// Compare the content of two single-precision, floating-point values of
632 /// a and b
633 #define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
634 
635 /// Adds the lower single-precision, floating-point values of a and b; the
636 /// upper three single-precision, floating-point values are passed through from a.
637 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
638 #define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
639  vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
640 
641 /// Multiplies the four single-precision, floating-point values
642 /// of a and b (see _mm_mul_ps)
643 #define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
644 
645 /// Multiplies the four single-precision, floating-point values of a
646 /// by the single-precision, floating-point scalar b
647 #define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
648 
649 /// Rough estimation of division
650 // NOTE(review): the declaration line for this body is missing from this listing. The body takes two float vectors named a and b and returns an AKSIMD_V4F32 approximating a / b - restore the line from the SDK header.
651 {
652  AKSIMD_V4F32 inv = vrecpeq_f32(b); // initial reciprocal estimate of b
653  AKSIMD_V4F32 restep = vrecpsq_f32(b, inv); // reciprocal-step correction factor for that estimate
654  inv = vmulq_f32(restep, inv); // estimate after one refinement step (still approximate)
655  return vmulq_f32(a, inv); // a * (1 / b)
656 }
657 
658 /// Multiplies the two single-precision, floating-point values
659 /// of a and b
660 #define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
661 
662 /// Multiplies the two single-precision, floating-point values of a
663 /// by the single-precision, floating-point scalar b
664 #define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
665 
666 /// Multiplies the lower single-precision, floating-point values of
667 /// a and b; the upper three single-precision, floating-point values
668 /// are passed through from a (they are multiplied by 1.0f, not by 0).
669 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
670 #define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
671  vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), vmovq_n_f32( 1.0f ), 0 ) )
672 
673 /// Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where appropriate)
674 #if defined(AK_CPU_ARM_64)
675  #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vfmaq_f32( (__c__), (__a__), (__b__) )
676  #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) vfma_f32( (__c__), (__a__), (__b__) )
677  #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vfmaq_n_f32( (__c__), (__a__), (__b__) )
678  #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vfma_n_f32( (__c__), (__a__), (__b__) )
679 #else
680  #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
681  #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
682  #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
683  #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
684 #endif
685 
686 /// Not a direct translation to vfmsq_f32 because that operation does -a*b+c, not a*b-c.
687 /// Explicitly adding an additional negation tends to produce worse codegen than giving the compiler a chance to re-order things slightly
688 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
689 #define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
690 
691 /// Multiply-add on the lowest element, built from AKSIMD_MUL_SS_V4F32 and AKSIMD_ADD_SS_V4F32: r0 := a0 * b0 + c0
693 {
694  return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
695 }
696 
697 /// Computes the minima of the four single-precision, floating-point
698 /// values of a and b (see _mm_min_ps)
699 #define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
700 
701 /// Computes the minima of the two single-precision, floating-point
702 /// values of a and b
703 #define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
704 
705 /// Computes the maximums of the four single-precision, floating-point
706 /// values of a and b (see _mm_max_ps)
707 #define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
708 
709 /// Computes the maximums of the two single-precision, floating-point
710 /// values of a and b
711 #define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
712 
713 /// Returns absolute value
714 #define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
715 
716 /// Changes the sign
717 #define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
718 #define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
719 
720 /// Square root approximation (4 floats): reciprocal estimate of the reciprocal square root estimate
721 #define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( (__vec__) ) )
722 
723 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
724 #define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
725 
726 /// Square root approximation (2 floats): reciprocal estimate of the reciprocal square root estimate
727 #define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( (__vec__) ) )
728 
729 /// Reciprocal estimate of x (approximately 1/x) for each of the 4 floats
730 #define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32( (__a__) )
731 
732 /// Faked in-place vector horizontal add.
733 /// \akwarning
734 /// Don't expect this to be very efficient.
735 /// \endakwarning
737 {
738  AKSIMD_V4F32 vAb = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0xB1);
739  AKSIMD_V4F32 vHaddAb = AKSIMD_ADD_V4F32(vVec, vAb);
740  AKSIMD_V4F32 vHaddCd = AKSIMD_SHUFFLE_V4F32(vHaddAb, vHaddAb, 0x4E);
741  AKSIMD_V4F32 vHaddAbcd = AKSIMD_ADD_V4F32(vHaddAb, vHaddCd);
742  return vHaddAbcd;
743 }
744 
745 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
747 {
748 #ifdef AKSIMD_DECLARE_V4F32
749  static const AKSIMD_DECLARE_V4F32( vSign, -0.f, 0.f, -0.f, 0.f);
750 #else
751  static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
752 #endif
753 
754  float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2); // val[0] will be reals extended, val[1] will be imag
755  vC2Ext.val[1] = AKSIMD_XOR_V4F32(vC2Ext.val[1], vSign);
756  float32x4_t vC1Rev = vrev64q_f32(vCIn1);
757  float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);
758  float32x4_t vFinal = AKSIMD_MADD_V4F32(vC1Rev, vC2Ext.val[1], vMul);
759  return vFinal;
760 }
761 
762 // Alternately subtracts and adds packed single-precision (32-bit) floating-point elements of vIn2 from/to vIn1:
763 // r0 := in1_0 - in2_0; r1 := in1_1 + in2_1; r2 := in1_2 - in2_2; r3 := in1_3 + in2_3 (see _mm_addsub_ps)
765 {
766 #ifdef AKSIMD_DECLARE_V4F32
767  static const AKSIMD_DECLARE_V4F32(vSign, -0.f, 0.f, -0.f, 0.f);
768 #else
769  static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
770 #endif
771  float32x4_t vIn2SignFlip = AKSIMD_XOR_V4F32(vIn2, vSign);
772  return vaddq_f32(vIn1, vIn2SignFlip);
773 }
774 
775 //@}
776 ////////////////////////////////////////////////////////////////////////
777 
778 
779 ////////////////////////////////////////////////////////////////////////
780 /// @name AKSIMD packing / unpacking
781 //@{
782 
783 /// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
784 /// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
785 #define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
786 
787 /// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
788 /// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
789 #define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
790 
791 /// Selects and interleaves the lower two single-precision, floating-point
792 /// values from a and b (see _mm_unpacklo_ps)
794 {
795  // sce_vectormath_xayb(in_vec1, in_vec2)
796  float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
797  float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
798  float32x2x2_t xa_yb = vtrn_f32( xy, ab );
799  AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
800  return xayb;
801 }
802 
803 /// Selects and interleaves the upper two single-precision, floating-point
804 /// values from a and b (see _mm_unpackhi_ps)
806 {
807  //return sce_vectormath_zcwd( in_vec1, in_vec2 );
808  float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
809  float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
810  float32x2x2_t zc_wd = vtrn_f32( zw, cd );
811  AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
812  return zcwd;
813 }
814 
815 /// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
816 /// integers and saturates (see _mm_packs_epi32)
818 {
819  int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
820  int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
821  int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
822  return vreinterpretq_s32_s16( result );
823 }
824 
825 /// V1 = {a,b} => VR = {b,c}
826 /// V2 = {c,d} =>
827 #define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
828 
829 /// V1 = {a,b} => V1 = {a,c}
830 /// V2 = {c,d} => V2 = {b,d}
831 #define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
832 
833 #define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
834 
835 /// V1 = {a,b} => VR = {b,a}
836 #define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
837 
838 // Given four pointers, gathers 32-bits of data from each location,
839 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
840 // e.g. (*addr[0]) := (b a)
841 // e.g. (*addr[1]) := (d c)
842 // e.g. (*addr[2]) := (f e)
843 // e.g. (*addr[3]) := (h g)
844 // return struct has
845 // val[0] := (g e c a)
846 // val[1] := (h f d b)
848 {
849  int16x4x2_t ret16{
850  vdup_n_s16(0),
851  vdup_n_s16(0)
852  };
853 
854  ret16 = vld2_lane_s16(addr0, ret16, 0);
855  ret16 = vld2_lane_s16(addr1, ret16, 1);
856  ret16 = vld2_lane_s16(addr2, ret16, 2);
857  ret16 = vld2_lane_s16(addr3, ret16, 3);
858 
859  AKSIMD_V4I32X2 ret{
860  vmovl_s16(ret16.val[0]),
861  vmovl_s16(ret16.val[1])
862  };
863  return ret;
864 }
865 
866 // Given four pointers, gathers 64-bits of data from each location,
867 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
868 // e.g. (*addr[0]) := (d c b a)
869 // e.g. (*addr[1]) := (h g f e)
870 // e.g. (*addr[2]) := (l k j i)
871 // e.g. (*addr[3]) := (p o n m)
872 // return struct has
873 // val[0] := (m i e a)
874 // val[1] := (n j f b)
875 // val[2] := (o k g c)
876 // val[3] := (p l h d)
877 
879 {
880  int16x4x4_t ret16{
881  vdup_n_s16(0),
882  vdup_n_s16(0),
883  vdup_n_s16(0),
884  vdup_n_s16(0)
885  };
886 
887  ret16 = vld4_lane_s16(addr0, ret16, 0);
888  ret16 = vld4_lane_s16(addr1, ret16, 1);
889  ret16 = vld4_lane_s16(addr2, ret16, 2);
890  ret16 = vld4_lane_s16(addr3, ret16, 3);
891 
892  AKSIMD_V4I32X4 ret{
893  vmovl_s16(ret16.val[0]),
894  vmovl_s16(ret16.val[1]),
895  vmovl_s16(ret16.val[2]),
896  vmovl_s16(ret16.val[3])
897  };
898  return ret;
899 }
900 
901 //@}
902 ////////////////////////////////////////////////////////////////////////
903 
904 ////////////////////////////////////////////////////////////////////////
905 /// @name AKSIMD vector comparison
906 /// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
907 //@{
908 
909 #define AKSIMD_CMP_CTRLMASK uint32x4_t
910 
911 /// Compare each float element and return control mask.
912 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
913 
914 /// Compare each float element and return control mask.
915 #define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
916 
917 /// Compare each float element and return control mask.
918 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
919 
920 /// Compare each float element and return control mask.
921 #define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
922 
923 /// Compare each integer element and return control mask.
924 #define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
925 
926 /// Compare each float element and return control mask.
927 #define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
928 
929 /// Compare each integer element and return control mask.
930 #define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
931 
932 /// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
933 #define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
934 
935 // Per element: (cond1 >= cond2) ? b : a.
936 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( (__a__), (__b__), vcgeq_f32( (__cond1__), (__cond2__) ) )
937 
938 // Per element: a >= 0 ? b : c, with the arguments in the same order as the C++ conditional operator.
939 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( (__a__), AKSIMD_SETZERO_V4F32() ) )
940 
941 #define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32( vgetq_lane_f32( (var), (idx) ) ) ///< Copies element idx of var into all four elements
942 
943 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 )
944 {
945 #ifdef AKSIMD_DECLARE_V4F32
946  static const AKSIMD_DECLARE_V4I32(movemask, 1, 2, 4, 8);
947  static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
948 #else
949  static const uint32x4_t movemask = { 1, 2, 4, 8 };
950  static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
951 #endif
952 
953  uint32x4_t t0 = in_vec1;
954  uint32x4_t t1 = vtstq_u32(t0, highbit);
955  uint32x4_t t2 = vandq_u32(t1, movemask);
956  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
957  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
958 }
959 
960 #ifndef AK_WIN
961 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
962 {
963  return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
964 }
965 #endif
966 
967 // returns true if every element of the provided vector is zero
969 {
970 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
971  uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
972  return maxValue == 0;
973 #else
974  int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
975  return vget_lane_s64(orReduce, 0) == 0;
976 #endif
977 }
978 #define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
979 #define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
980 
982 {
983 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
984  uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
985  return minValue == ~0;
986 #else
987  int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
988  return vget_lane_s64(andReduce, 0) == ~0LL;
989 #endif
990 }
991 #define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
992 #define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))
993 
994 
996 {
997  int32x4_t temp = AKSIMD_SETV_V4I32(8, 4, 2, 1);
998  int32x4_t xvec = AKSIMD_SET_V4I32((AkInt32)x);
999  int32x4_t xand = AKSIMD_AND_V4I32(xvec, temp);
1000  return AKSIMD_EQ_V4I32(temp, xand);
1001 }
1002 
1003 
1004 //@}
1005 ////////////////////////////////////////////////////////////////////////
1006 
1007 #endif //_AKSIMD_ARM_NEON_H_
1008 
AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32(const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
Vector multiply-add operation.
Definition: AkSimd.h:692
static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER(uint16x4_t vecs16)
Definition: AkSimd.h:293
static AkForceInline AKSIMD_V4I32 AKSIMD_ROUND_V4F32_TO_V4I32(AKSIMD_V4F32 a)
Definition: AkSimd.h:265
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimd.h:73
float32x2x2_t AKSIMD_V2F32X2
Definition: AkSimd.h:80
static AkForceInline float32x4_t AKSIMD_SETV_V2F64(AkReal64 b, AkReal64 a)
Definition: AkSimd.h:155
static AkForceInline float32x4_t AKSIMD_SETV_V4F32(float32_t d, float32_t c, float32_t b, float32_t a)
Populates the full vector with the 4 floating point elements provided.
Definition: AkSimd.h:112
#define AKSIMD_SET_V4I32(__scalar__)
Sets the four integer values to scalar
Definition: AkSimd.h:122
#define AKSIMD_DECLARE_V4F32(_x, _a, _b, _c, _d)
Definition: AkSimd.h:99
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:817
static AkForceInline AKSIMD_V4F32 AKSIMD_NOT_V4F32(const AKSIMD_V4F32 &in_vec)
Definition: AkSimd.h:488
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32(AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2)
Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary par...
Definition: AkSimd.h:746
float32x4x4_t AKSIMD_V4F32X4
Definition: AkSimd.h:82
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimd.h:75
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:878
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:847
double AkReal64
64-bit floating point
Definition: AkTypes.h:71
static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a)
Definition: AkSimd.h:124
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:680
static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:472
int16x4_t AKSIMD_V4I16
Vector of 4 16-bit signed integers.
Definition: AkSimd.h:67
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:549
#define AKSIMD_AND_V4I32(__a__, __b__)
Definition: AkSimd.h:446
static AkForceInline AKSIMD_V4F32 AKSIMD_AND_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:480
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32(AKSIMD_V4F32 vVec)
Definition: AkSimd.h:736
#define AKSIMD_MUL_SS_V4F32(__a__, __b__)
Definition: AkSimd.h:670
static AkForceInline AKSIMD_V4F32 AKSIMD_ADDSUB_V4F32(AKSIMD_V4F32 vIn1, AKSIMD_V4F32 vIn2)
Definition: AkSimd.h:764
int16_t AkInt16
Signed 16-bit integer.
Definition: AkTypes.h:63
#define AKSIMD_EQ_V4I32(__a__, __b__)
Compare each integer element and return control mask.
Definition: AkSimd.h:930
int32x2_t AKSIMD_V2I32
Vector of 2 32-bit signed integers.
Definition: AkSimd.h:70
uint32x4_t AKSIMD_V4ICOND
Vector of 4 comparison results.
Definition: AkSimd.h:76
uint32x4_t AKSIMD_V4FCOND
Vector of 4 comparison results.
Definition: AkSimd.h:77
float32x2_t AKSIMD_V2F32
Vector of 2 32-bit floats.
Definition: AkSimd.h:72
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:793
float32_t AKSIMD_F32
32-bit float
Definition: AkSimd.h:71
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:650
uint32x4_t AKSIMD_V4UI32
Vector of 4 32-bit unsigned integers.
Definition: AkSimd.h:68
static AkForceInline bool AKSIMD_TESTONES_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:981
int32x4_t AKSIMD_V4I32
Vector of 4 32-bit signed integers.
Definition: AkSimd.h:65
#define AKSIMD_DECLARE_V4I32(_x, _a, _b, _c, _d)
Definition: AkSimd.h:103
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:612
uint32_t AkUInt32
Unsigned 32-bit integer.
Definition: AkTypes.h:59
float32x4x2_t AKSIMD_V4F32X2
Definition: AkSimd.h:81
int32x4x2_t AKSIMD_V4I32X2
Definition: AkSimd.h:84
int32_t AkInt32
Signed 32-bit integer.
Definition: AkTypes.h:64
int32x4x4_t AKSIMD_V4I32X4
Definition: AkSimd.h:85
uint32x2_t AKSIMD_V2UI32
Vector of 2 32-bit unsigned integers.
Definition: AkSimd.h:69
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:805
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND(AkUInt32 x)
Definition: AkSimd.h:995
#define AkForceInline
Definition: AkTypes.h:60
static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:464
static AkForceInline bool AKSIMD_TESTZERO_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:968
static AkForceInline int AKSIMD_MASK_V4F32(const AKSIMD_V4UI32 &in_vec1)
Definition: AkSimd.h:943
int16x8_t AKSIMD_V8I16
Vector of 8 16-bit signed integers.
Definition: AkSimd.h:66
#define AKSIMD_ADD_SS_V4F32(__a__, __b__)
Definition: AkSimd.h:638
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16(AKSIMD_V4F32 vec)
Definition: AkSimd.h:342
static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a)
Definition: AkSimd.h:133