Version
menu_open
Wwise SDK 2023.1.4
AkSimd.h
Go to the documentation of this file.
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Copyright (c) 2024 Audiokinetic Inc.
25 *******************************************************************************/
26 
27 // AkSimd.h
28 
29 /// \file
30 /// AKSIMD - arm_neon implementation
31 
32 #pragma once
33 
34 #if defined _MSC_VER && defined _M_ARM64
35  #include <arm64_neon.h>
36 #else
37  #include <arm_neon.h>
38 #endif
39 
42 
43 ////////////////////////////////////////////////////////////////////////
44 /// @name Platform specific memory size alignment for allocation purposes
45 //@{
46 
47 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
48 
49 //@}
50 ////////////////////////////////////////////////////////////////////////
51 
52 ////////////////////////////////////////////////////////////////////////
53 /// @name AKSIMD loading / setting
54 //@{
55 
56 /// Loads four single-precision, floating-point values.
57 /// The address has no alignment requirement, (see _mm_loadu_ps).
58 #define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
59 
60 /// Loads four single-precision, floating-point values.
61 /// The address has no alignment requirement, (see _mm_loadu_ps).
62 #define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
63 
64 /// Loads a single single-precision, floating-point value, copying it into
65 /// all four words (see _mm_load1_ps, _mm_load_ps1)
66 #define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
67 
68 /// Sets the four single-precision, floating-point values to __scalar__ (see
69 /// _mm_set1_ps, _mm_set_ps1)
70 #define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
71 
72 /// Populates the full vector with the 4 floating point elements provided
static AkForceInline float32x4_t AKSIMD_SETV_V4F32(float32_t d, float32_t c, float32_t b, float32_t a) {
	// Build the vector lane by lane, lowest lane first; argument order is
	// high-element-first ('a' lands in lane 0, 'd' in lane 3).
	float32x4_t result = vdupq_n_f32(0);
	result = vsetq_lane_f32(a, result, 0);
	result = vsetq_lane_f32(b, result, 1);
	result = vsetq_lane_f32(c, result, 2);
	result = vsetq_lane_f32(d, result, 3);
	return result;
}
81 
82 /// Sets the four integer values to __scalar__
83 #define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
84 
static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a) {
	// Populate lanes from lowest to highest; argument order is
	// high-element-first ('a' lands in lane 0, 'd' in lane 3).
	int32x4_t result = vdupq_n_s32(0);
	result = vsetq_lane_s32(a, result, 0);
	result = vsetq_lane_s32(b, result, 1);
	result = vsetq_lane_s32(c, result, 2);
	result = vsetq_lane_s32(d, result, 3);
	return result;
}
93 
static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a) {
#if defined AK_CPU_ARM_64
	// ARM64 can safely handle 64-bit scalar lane inserts (including values
	// that originated from dereferenced 64-bit pointers), so use them directly.
	int64x2_t result = vdupq_n_s64(0);
	result = vsetq_lane_s64(a, result, 0);
	result = vsetq_lane_s64(b, result, 1);
	return vreinterpretq_s32_s64(result);
#else
	// On 32-bit ARM an int64_t argument may come straight from a dereferenced
	// 64-bit pointer, which could turn into a potentially unaligned 64-bit
	// load. Splitting each input into two 32-bit halves and inserting them as
	// 32-bit lanes keeps every memory access 32 bits wide, which carries no
	// alignment requirement.
	int32x4_t result = vdupq_n_s32(0);
	result = vsetq_lane_s32(int32_t(a & 0xFFFFFFFF), result, 0);
	result = vsetq_lane_s32(int32_t((a >> 32) & 0xFFFFFFFF), result, 1);
	result = vsetq_lane_s32(int32_t(b & 0xFFFFFFFF), result, 2);
	result = vsetq_lane_s32(int32_t((b >> 32) & 0xFFFFFFFF), result, 3);
	return result;
#endif
}
115 
117 {
118 #if defined AK_CPU_ARM_64
119  float64x2_t ret = (float64x2_t)vdupq_n_s64(0);
120  ret = vsetq_lane_f64(b, ret, 1);
121  ret = vsetq_lane_f64(a, ret, 0);
122  return (float32x4_t)(ret);
123 #else
124  int64_t a64 = *(int64_t*)&a;
125  int64_t b64 = *(int64_t*)&b;
126  int32x4_t ret = vdupq_n_s32(0);
127  ret = vsetq_lane_s32(int32_t((b64 >> 32) & 0xFFFFFFFF), ret, 3);
128  ret = vsetq_lane_s32(int32_t((b64 >> 0) & 0xFFFFFFFF), ret, 2);
129  ret = vsetq_lane_s32(int32_t((a64 >> 32) & 0xFFFFFFFF), ret, 1);
130  ret = vsetq_lane_s32(int32_t((a64 >> 0) & 0xFFFFFFFF), ret, 0);
131  return vreinterpretq_f32_s32(ret);
132 #endif
133 }
134 
135 /// Sets the four single-precision, floating-point values to zero (see
136 /// _mm_setzero_ps)
137 #define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
138 
139 /// Loads a single-precision, floating-point value into the low word
140 /// and clears the upper three words.
141 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
142 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
143 
144 /// Loads four 32-bit signed integer values (aligned)
145 #define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
146 
147 /// Loads 8 16-bit signed integer values (aligned)
148 #define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
149 
150 /// Loads 4 16-bit signed integer values (aligned)
151 #define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
152 
153 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
154 #define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
155 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
156 #define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
157 
158 /// Loads two single-precision, floating-point values
159 #define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
160 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
161 
162 /// Sets the two single-precision, floating-point values to __scalar__
163 #define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
164 
165 /// Sets the two single-precision, floating-point values to zero
166 #define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
167 
168 /// Loads data from memory and de-interleaves
169 #define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
170 #define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
171 
172 /// Loads data from memory and de-interleaves; only selected lane
173 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
174 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
175 
176 //@}
177 ////////////////////////////////////////////////////////////////////////
178 
179 
180 ////////////////////////////////////////////////////////////////////////
181 /// @name AKSIMD storing
182 //@{
183 
184 /// Stores four single-precision, floating-point values.
185 /// The address has no alignment requirement, (see _mm_storeu_ps).
186 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
187 
188 /// Stores four single-precision, floating-point values.
189 /// The address has no alignment requirement, (see _mm_storeu_ps).
190 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
191 
192 /// Stores the lower single-precision, floating-point value.
193 /// *p := a0 (see _mm_store_ss)
194 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
195 
196 /// Stores four 32-bit integer values. The address must be 16-byte aligned.
197 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
198 
199 /// Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
200 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
201 
202 /// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
203 #define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
204 
205 /// Stores two single-precision, floating-point values. The address must be 16-byte aligned.
206 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
207 
208 /// Stores data by interleaving into memory
209 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
210 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
211 
212 /// Stores the lower double-precision, floating-point value.
213 /// *p := a0 (see _mm_store_sd)
214 #define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) vst1q_lane_f64( (float64_t*)(__addr__), vreinterpretq_f64_f32(__vec__), 0 )
215 
216 //@}
217 ////////////////////////////////////////////////////////////////////////
218 
219 
220 ////////////////////////////////////////////////////////////////////////
221 /// @name AKSIMD conversion
222 //@{
223 
224 /// Converts the four signed 32-bit integer values of a to single-precision,
225 /// floating-point values (see _mm_cvtepi32_ps)
226 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
227 
228 /// Converts the four single-precision, floating-point values of a to signed
229 /// 32-bit integer values (see _mm_cvtps_epi32)
231 #if defined AK_CPU_ARM_64
232  return vcvtaq_s32_f32(a);
233 #else
234  // on ARMv7, need to add 0.5 (away from zero) and truncate that
235  float32x4_t halfPos = vdupq_n_f32(0.5f);
236  float32x4_t halfNeg = vdupq_n_f32(-0.5f);
237  float32x4_t zero = vdupq_n_f32(0.0f);
238  const uint32x4_t signMask = vcgtq_f32(a, zero);
239  const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg);
240  const float32x4_t aOffset = vaddq_f32(a, signedHalf);
241  return vcvtq_s32_f32(aOffset);
242 #endif
243 }
244 
245 /// Converts the four single-precision, floating-point values of a to signed
246 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
247 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
248 
249 /// Converts the 4 half-precision floats in the lower 64-bits of the provided
250 /// vector to 4 full-precision floats
251 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
252 
253 /// Converts the 4 half-precision floats in the upper 64-bits of the provided
254 /// vector to 4 full-precision floats
255 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
256 
257 
259 {
260 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
261 
262  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
263  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
264  float32x4_t ret;
265  __asm__("fcvtl %0.4s, %1.4h" \
266  : "=w"(ret) \
267  : "w"(vecs16)
268  );
269  return ret;
270 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
271  float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
272  float32x4_t ret = vcvt_f32_f16(vecf16);
273  uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16);
274  ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData);
275  return ret;
276 #elif defined(AK_CPU_ARM_64)
277  return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
278 #else
279  uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16);
280  uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000));
281  uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds
282 
283  // Determine if value is denorm or not, and use that to determine how much exponent should be scaled by, and if we need post-scale fp adjustment
284  uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
285  uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
286  uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
287 
288  // Apply the exponent increment and adjust
289  uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
290  uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
291 
292  // if fp16 val was inf or nan, preserve the inf/nan exponent field (we can just 'or' inf-nan
293  uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
294  uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
295  uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
296 
297  // reincorporate the sign
298  uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
299  float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
300  return assembledFloat;
301 #endif
302 }
303 
304 
305 /// Converts the 4 full-precision floats vector to 4 half-precision floats
306 /// occupying the lower bits and leaving the upper bits as zero
308 {
309 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
310  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
311  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
312  int32x4_t ret;
313  __asm__("fcvtn %1.4h, %1.4s\n" \
314  "\tmov %0.8b, %1.8b" \
315  : "=w"(ret) \
316  : "w"(vec)
317  );
318  return ret;
319 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
320  float16x4_t ret = vcvt_f16_f32(vec);
321  uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
322  ret = vorr_u16(vreinterpret_s16_f16(ret), signData);
323  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
324 #elif defined(AK_CPU_ARM_64)
325  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
326 #else
327  uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
328  uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
329 
330  // do the processing for values that will be denormed in float16
331  // Add 0.5 to get value within range, and rounde; then move mantissa data up
332  float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
333  uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
334 
335  // processing for values that will be normal in float16
336  uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF); // -131072 + rounding bias
337  uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
338  uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
339  uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31); // Extend Lsb so that it's -1 when set
340  uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
341  uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
342 
343  // Combine the norm and subnorm paths together
344  uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23); // smallest float32 that yields a normalized float16
345  uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
346 
347  uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
348 
349  // apply inf/nan check
350  uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000)); // test if exponent bits are zero or not
351  uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
352  uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec)); // mark the parts of the vector where we have a mantissa (i.e. NAN) as 0xffffffff
353  uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000)); // set the NaN mantissa bit if mantissa suggests this is NaN
354  uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
355  uint32x4_t infNanData = vorrq_u32(infData, nantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
356 
357  uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData); // and combine the results
358 
359  // reincorporate the original sign
360  uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
361 
362  // store results packed in lower 64 bits, and set upper 64 to zero
363  uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
364  return vreinterpretq_s32_u16(resultZip.val[1]);
365 #endif
366 }
367 
368 //@}
369 ////////////////////////////////////////////////////////////////////////
370 
371 ////////////////////////////////////////////////////////////////////////
372 /// @name AKSIMD cast
373 //@{
374 
375 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
376 /// used for compilation and does not generate any instructions, thus it has zero latency.
377 #define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
378 
379 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
380 /// used for compilation and does not generate any instructions, thus it has zero latency.
381 #define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
382 
383 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
384 /// used for compilation and does not generate any instructions, thus it has zero latency.
385 #define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
386 
387 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
388 /// used for compilation and does not generate any instructions, thus it has zero latency.
389 #define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
390 
391 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
392 /// used for compilation and does not generate any instructions, thus it has zero latency.
393 #define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
394 
395 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
396 /// used for compilation and does not generate any instructions, thus it has zero latency.
397 #define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
398 
399 #define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
400 
401 #define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
402 
403 /// Cast vector of type AKSIMD_V4COND to AKSIMD_V4I32.
404 #define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) (AKSIMD_V4COND)(__vec__)
405 
406 /// Cast vector of type AKSIMD_V4I32 to AKSIMD_V4COND.
407 #define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) (AKSIMD_V4COND)(__vec__)
408 
409 //@}
410 ////////////////////////////////////////////////////////////////////////
411 
412 ////////////////////////////////////////////////////////////////////////
413 /// @name AKSIMD logical operations
414 //@{
415 
416 /// Computes the bitwise AND of the 128-bit value in a and the
417 /// 128-bit value in b (see _mm_and_si128)
418 #define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
419 
420 /// Compares the 8 signed 16-bit integers in a and the 8 signed
421 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
422 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
423  vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
424 
425 /// Compares for less than or equal (see _mm_cmple_ps)
426 #define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
427 
428 #define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
429 #define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
430 
431 #define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32(__a__,__b__)
432 #define AKSIMD_NOT_V4I32( __a__ ) veorq_s32(__a__, vdupq_n_s32(~0u))
433 
434 #define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
435 
static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
{
	// NEON has no float-typed bitwise ops: reinterpret both operands as u32
	// lanes, XOR them, and reinterpret the result back to float.
	return vreinterpretq_f32_u32(
		veorq_u32( vreinterpretq_u32_f32( in_vec0 ), vreinterpretq_u32_f32( in_vec1 ) ) );
}
443 
static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
{
	// NEON has no float-typed bitwise ops: reinterpret both operands as u32
	// lanes, OR them, and reinterpret the result back to float.
	return vreinterpretq_f32_u32(
		vorrq_u32( vreinterpretq_u32_f32( in_vec0 ), vreinterpretq_u32_f32( in_vec1 ) ) );
}
451 
453 {
454  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
455  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
456  uint32x4_t res = vandq_u32(t0, t1);
457  return vreinterpretq_f32_u32(res);
458 }
459 
461 {
462  uint32x4_t allSet = vdupq_n_u32(~0u);
463  uint32x4_t reinterpret = vreinterpretq_u32_f32(in_vec);
464  uint32x4_t result = veorq_u32(reinterpret, allSet);
465  return vreinterpretq_f32_u32(result);
466 }
467 
468 #define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__)
469 #define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__)
470 
471 #define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
472 //@}
473 ////////////////////////////////////////////////////////////////////////
474 
475 
476 ////////////////////////////////////////////////////////////////////////
477 /// @name AKSIMD shifting
478 //@{
479 
480 /// Shifts the 4 signed or unsigned 32-bit integers in a left by
481 /// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
482 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
483  vshlq_n_s32( (__vec__), (__shiftBy__) )
484 
485 /// Shifts the 4 signed or unsigned 32-bit integers in a right by
486 /// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
487 #define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
488  vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
489 
490 /// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
491 /// bits while shifting in the sign bit (see _mm_srai_epi32)
492 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
493  vshrq_n_s32( (__vec__), (__shiftBy__) )
494 
495 //@}
496 ////////////////////////////////////////////////////////////////////////
497 
498 
499 ////////////////////////////////////////////////////////////////////////
500 /// @name AKSIMD shuffling
501 //@{
502 
503 // Macro for combining two vector of 2 elements into one vector of
504 // 4 elements.
505 #define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
506 
507 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
508 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
509  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
510 
511 /// Selects four specific single-precision, floating-point values from
512 /// a and b, based on the mask i (see _mm_shuffle_ps)
513 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
514 // If you get a link error, it's probably because the required
515 // _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > is not implemented in
516 // <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
517 #if defined(__clang__)
518  #if defined(__has_builtin) && __has_builtin(__builtin_shufflevector)
519  #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
520  __builtin_shufflevector(a, b, (zyxw >> 0) & 0x3, (zyxw >> 2) & 0x3, ((zyxw >> 4) & 0x3) + 4, ((zyxw >> 6) & 0x3) + 4 )
521  #endif
522 #endif
523 
524 #ifndef AKSIMD_SHUFFLE_V4F32
525 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
526  _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
527 
528 // Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are
529 // implemented in a separate header file to keep this one cleaner:
531 
532 #endif
533 
534 #define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
535 
536 /// Moves the upper two single-precision, floating-point values of b to
537 /// the lower two single-precision, floating-point values of the result.
538 /// The upper two single-precision, floating-point values of a are passed
539 /// through to the result.
540 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
541 #define AKSIMD_MOVEHL_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__b__, __a__, AKSIMD_SHUFFLE(3,2,3,2))
542 
543 /// Moves the lower two single-precision, floating-point values of b to
544 /// the upper two single-precision, floating-point values of the result.
545 /// The lower two single-precision, floating-point values of a are passed
546 /// through to the result.
547 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
548 #define AKSIMD_MOVELH_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__a__, __b__, AKSIMD_SHUFFLE(1,0,1,0))
549 
550 /// Swap the 2 lower floats together and the 2 higher floats together.
551 #define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
552 
553 /// Swap the 2 lower floats with the 2 higher floats.
554 #define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
555 
556 /// Barrel-shift all floats by one.
557 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
558 
559 /// Duplicates the odd items into the even items (d c b a -> d d b b )
560 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
561 
562 /// Duplicates the even items into the odd items (d c b a -> c c a a )
563 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
564 //@}
565 ////////////////////////////////////////////////////////////////////////
566 
567 
568 ////////////////////////////////////////////////////////////////////////
569 /// @name AKSIMD arithmetic
570 //@{
571 
572 /// Subtracts the four single-precision, floating-point values of
573 /// a and b (a - b) (see _mm_sub_ps)
574 #define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
575 
576 /// Subtracts the two single-precision, floating-point values of
577 /// a and b
578 #define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
579 
580 /// Subtracts the lower single-precision, floating-point values of a and b.
581 /// The upper three single-precision, floating-point values are passed through from a.
582 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
583 #define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
584  vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
585 
586 /// Adds the four single-precision, floating-point values of
587 /// a and b (see _mm_add_ps)
588 #define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
589 
590 /// Adds the two single-precision, floating-point values of
591 /// a and b
592 #define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
593 
594 /// Adds the four integers of a and b
595 #define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
596 
597 /// Multiplies the 4 low-parts of both operand into the 4 32-bit integers (no overflow)
598 #define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
599 
600 /// Multiplies the 4 low-parts of both operand into the 4 32-bit integers (no overflow)
601 #define AKSIMD_MULLO_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
602 
603 /// Compare the content of four single-precision, floating-point values of
604 /// a and b
605 #define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
606 
607 /// Compare the content of two single-precision, floating-point values of
608 /// a and b
609 #define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
610 
611 /// Adds the lower single-precision, floating-point values of a and b; the
612 /// upper three single-precision, floating-point values are passed through from a.
613 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
614 #define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
615  vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
616 
617 /// Multiplies the four single-precision, floating-point values
618 /// of a and b (see _mm_mul_ps)
619 #define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
620 
621 /// Multiplies the four single-precision, floating-point values of a
622 /// by the single-precision, floating-point scalar b
623 #define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
624 
625 /// Rough estimation of division
627 {
628  AKSIMD_V4F32 inv = vrecpeq_f32(b);
629  AKSIMD_V4F32 restep = vrecpsq_f32(b, inv);
630  inv = vmulq_f32(restep, inv);
631  return vmulq_f32(a, inv);
632 }
633 
634 /// Multiplies the two single-precision, floating-point values
635 /// of a and b
636 #define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
637 
638 /// Multiplies the two single-precision, floating-point values of a
639 /// by the single-precision, floating-point scalar b
640 #define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
641 
642 /// Multiplies the lower single-precision, floating-point values of
643 /// a and b; the upper three single-precision, floating-point values
644 /// are passed through from a.
645 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
646 #define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
647  vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
648 
/// Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where appropriate)
/// All MADD variants compute per-lane a*b + c.
#if defined(AK_CPU_ARM_64)
	#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vfmaq_f32( (__c__), (__a__), (__b__) )
	#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) vfma_f32( (__c__), (__a__), (__b__) )
	#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vfmaq_n_f32( (__c__), (__a__), (__b__) )
	#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vfma_n_f32( (__c__), (__a__), (__b__) )
#else
	// 32-bit ARM: use the non-fused multiply-accumulate forms.
	#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
	#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
	#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
	#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
#endif

/// Not a direct translation to vfmsq_f32 because that operation does -a*b+c, not a*b-c.
/// Explicitly adding an additional negation tends to produce worse codegen than giving the compiler a chance to re-order things slightly
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
666 
/// Vector multiply-add operation on the lower (scalar) lane:
/// r0 := a0*b0 + c0. Upper-lane contents follow the semantics of
/// AKSIMD_MUL_SS_V4F32 / AKSIMD_ADD_SS_V4F32.
AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32( const AKSIMD_V4F32& __a__, const AKSIMD_V4F32& __b__, const AKSIMD_V4F32& __c__ )
{
	return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
}
672 
/// Computes the minima of the four single-precision, floating-point
/// values of a and b (see _mm_min_ps)
#define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )

/// Computes the minima of the two single-precision, floating-point
/// values of a and b
#define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )

/// Computes the maximums of the four single-precision, floating-point
/// values of a and b (see _mm_max_ps)
#define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )

/// Computes the maximums of the two single-precision, floating-point
/// values of a and b
#define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )

/// Returns absolute value
#define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))

/// Changes the sign
#define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
#define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )

/// Square root (4 floats)
/// \akwarning
/// Computed as the reciprocal estimate of the reciprocal-square-root estimate;
/// this is a low-precision approximation, not an exact sqrt.
/// \endakwarning
#define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
#define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )

/// Square root (2 floats) — same estimate-based approximation as the 4-float version.
#define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )

/// Reciprocal of x (1/x) — NEON estimate only, no refinement step.
#define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32(__a__)
707 
/// Faked in-place vector horizontal add: every lane of the result holds the
/// sum of all four input lanes.
/// \akwarning
/// Don't expect this to be very efficient.
/// \endakwarning
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
{
	AKSIMD_V4F32 vAb = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0xB1);			// swap within pairs: (y,x,w,z)
	AKSIMD_V4F32 vHaddAb = AKSIMD_ADD_V4F32(vVec, vAb);					// (x+y, x+y, z+w, z+w)
	AKSIMD_V4F32 vHaddCd = AKSIMD_SHUFFLE_V4F32(vHaddAb, vHaddAb, 0x4E);	// swap 64-bit halves
	AKSIMD_V4F32 vHaddAbcd = AKSIMD_ADD_V4F32(vHaddAb, vHaddCd);		// full sum in every lane
	return vHaddAbcd;
}
720 
/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts.
/// Input layout per vector: (re0, im0, re1, im1).
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
{
#ifdef AKSIMD_DECLARE_V4F32
	static const AKSIMD_DECLARE_V4F32( vSign, -0.f, 0.f, -0.f, 0.f);
#else
	static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
#endif

	float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2); // val[0] will be reals extended, val[1] will be imag
	vC2Ext.val[1] = AKSIMD_XOR_V4F32(vC2Ext.val[1], vSign);	// negate the even-lane copies of the imaginary parts
	float32x4_t vC1Rev = vrev64q_f32(vCIn1);				// swap re/im within each complex pair of vCIn1
	float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);		// (re1*re2, im1*re2, ...)
	float32x4_t vFinal = AKSIMD_MADD_V4F32(vC1Rev, vC2Ext.val[1], vMul);	// add (-im1*im2, re1*im2, ...)
	return vFinal;
}
737 
// Alternately add and subtract packed single-precision (32-bit) floating-point elements in a
// to/from packed elements in b, and store the results in dst:
// dst := (a0-b0, a1+b1, a2-b2, a3+b3), implemented by flipping the sign of the
// even lanes of b and adding.
static AkForceInline AKSIMD_V4F32 AKSIMD_ADDSUB_V4F32( AKSIMD_V4F32 vIn1, AKSIMD_V4F32 vIn2 )
{
#ifdef AKSIMD_DECLARE_V4F32
	static const AKSIMD_DECLARE_V4F32(vSign, -0.f, 0.f, -0.f, 0.f);
#else
	static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
#endif
	float32x4_t vIn2SignFlip = AKSIMD_XOR_V4F32(vIn2, vSign);	// negate lanes 0 and 2 of vIn2
	return vaddq_f32(vIn1, vIn2SignFlip);
}
750 
751 //@}
752 ////////////////////////////////////////////////////////////////////////
753 
754 
755 ////////////////////////////////////////////////////////////////////////
756 /// @name AKSIMD packing / unpacking
757 //@{
758 
/// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
/// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
#define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )

/// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
/// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
#define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
766 
/// Selects and interleaves the lower two single-precision, floating-point
/// values from a and b (see _mm_unpacklo_ps): (x,y,z,w),(a,b,c,d) -> (x,a,y,b)
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
	// sce_vectormath_xayb(in_vec1, in_vec2)
	float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
	float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
	float32x2x2_t xa_yb = vtrn_f32( xy, ab );	// transpose interleaves the pairs
	AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
	return xayb;
}
778 
/// Selects and interleaves the upper two single-precision, floating-point
/// values from a and b (see _mm_unpackhi_ps): (x,y,z,w),(a,b,c,d) -> (z,c,w,d)
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
	//return sce_vectormath_zcwd( in_vec1, in_vec2 );
	float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
	float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
	float32x2x2_t zc_wd = vtrn_f32( zw, cd );	// transpose interleaves the pairs
	AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
	return zcwd;
}
790 
/// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
/// integers and saturates (see _mm_packs_epi32); the result is returned
/// reinterpreted as a 32-bit integer vector.
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
{
	int16x4_t vec1_16 = vqmovn_s32( in_vec1 );	// saturating narrow to 16 bits
	int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
	int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
	return vreinterpretq_s32_s16( result );
}
800 
/// V1 = {a,b} => VR = {b,c}
/// V2 = {c,d} =>
#define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )

/// V1 = {a,b} => V1 = {a,c}
/// V2 = {c,d} => V2 = {b,d}
#define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )

/// 4x4 variant: transposes 2x2 sub-blocks across the two vectors (see vtrnq_f32).
#define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )

/// V1 = {a,b} => VR = {b,a}
#define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
813 
// Given four pointers, gathers 32-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (b a)
// e.g. (*addr[1]) := (d c)
// e.g. (*addr[2]) := (f e)
// e.g. (*addr[3]) := (h g)
// return struct has
// val[0] := (g e c a)
// val[1] := (h f d b)
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
	// Start from zeroed lanes, then load one interleaved pair into each lane.
	int16x4x2_t ret16{
		vdup_n_s16(0),
		vdup_n_s16(0)
	};

	ret16 = vld2_lane_s16(addr0, ret16, 0);
	ret16 = vld2_lane_s16(addr1, ret16, 1);
	ret16 = vld2_lane_s16(addr2, ret16, 2);
	ret16 = vld2_lane_s16(addr3, ret16, 3);

	// Sign-extend the gathered 16-bit lanes to 32 bits.
	AKSIMD_V4I32X2 ret{
		vmovl_s16(ret16.val[0]),
		vmovl_s16(ret16.val[1])
	};
	return ret;
}
841 
// Given four pointers, gathers 64-bits of data from each location,
// deinterleaves them as 16-bits of each, and sign-extends to 32-bits
// e.g. (*addr[0]) := (d c b a)
// e.g. (*addr[1]) := (h g f e)
// e.g. (*addr[2]) := (l k j i)
// e.g. (*addr[3]) := (p o n m)
// return struct has
// val[0] := (m i e a)
// val[1] := (n j f b)
// val[2] := (o k g c)
// val[3] := (p l h d)

static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
	// Start from zeroed lanes, then load one interleaved quad into each lane.
	int16x4x4_t ret16{
		vdup_n_s16(0),
		vdup_n_s16(0),
		vdup_n_s16(0),
		vdup_n_s16(0)
	};

	ret16 = vld4_lane_s16(addr0, ret16, 0);
	ret16 = vld4_lane_s16(addr1, ret16, 1);
	ret16 = vld4_lane_s16(addr2, ret16, 2);
	ret16 = vld4_lane_s16(addr3, ret16, 3);

	// Sign-extend the gathered 16-bit lanes to 32 bits.
	AKSIMD_V4I32X4 ret{
		vmovl_s16(ret16.val[0]),
		vmovl_s16(ret16.val[1]),
		vmovl_s16(ret16.val[2]),
		vmovl_s16(ret16.val[3])
	};
	return ret;
}
876 
877 //@}
878 ////////////////////////////////////////////////////////////////////////
879 
////////////////////////////////////////////////////////////////////////
/// @name AKSIMD vector comparison
/// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
//@{

#define AKSIMD_CMP_CTRLMASK uint32x4_t

/// Compare each float element and return control mask.
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))

/// Compare each integer element and return control mask.
#define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))

/// Compare each integer element and return control mask.
#define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))

/// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
#define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )

// (cond1 >= cond2) ? b : a.
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )

// a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )

/// Broadcasts lane idx of var to all four lanes.
#define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
918 
/// Emulates SSE's _mm_movemask_ps: returns a 4-bit integer where bit i is set
/// iff the sign bit (bit 31) of lane i of in_vec1 is set.
static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 )
{
#ifdef AKSIMD_DECLARE_V4F32
	static const AKSIMD_DECLARE_V4I32(movemask, 1, 2, 4, 8);
	static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
#else
	static const uint32x4_t movemask = { 1, 2, 4, 8 };
	static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
#endif

	uint32x4_t t0 = in_vec1;
	uint32x4_t t1 = vtstq_u32(t0, highbit);	// all-ones in each lane whose sign bit is set
	uint32x4_t t2 = vandq_u32(t1, movemask);	// convert each flagged lane to its bit value 1/2/4/8
	uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));	// fold 4 lanes down to 2
	return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);	// OR the last two lanes into the scalar mask
}
935 
#ifndef AK_WIN
/// Float-vector overload: reinterprets the bits as unsigned and delegates to
/// the integer version above to extract the per-lane sign-bit mask.
static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
{
	return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
}
#endif
942 
// returns true if every element of the provided vector is zero
static AkForceInline bool AKSIMD_TESTZERO_V4I32( AKSIMD_V4I32 a )
{
#if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
	// Horizontal max across unsigned lanes is 0 iff every lane is 0.
	uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
	return maxValue == 0;
#else
	// Fallback: OR the two 64-bit halves together; the result is 0 iff all bits are 0.
	int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
	return vget_lane_s64(orReduce, 0) == 0;
#endif
}
#define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
956 
// returns true if every bit of every element of the provided vector is set
static AkForceInline bool AKSIMD_TESTONES_V4I32( AKSIMD_V4I32 a )
{
#if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
	// Horizontal min across unsigned lanes is 0xFFFFFFFF iff every lane is all-ones.
	uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
	return minValue == ~0u;	// unsigned literal: avoids comparing uint32_t against the int -1
#else
	// Fallback: AND the two 64-bit halves; the result is all-ones iff every bit is set.
	int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
	return vget_lane_s64(andReduce, 0) == ~0LL;
#endif
}
#define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
#define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))
969 
/// Builds a per-lane comparison mask from the low 4 bits of x: a lane of the
/// result is all-ones when its corresponding bit of x is set, zero otherwise.
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
{
	int32x4_t temp = AKSIMD_SETV_V4I32(8, 4, 2, 1);		// one distinct bit per lane
	int32x4_t xvec = AKSIMD_SET_V4I32((AkInt32)x);		// broadcast x to all lanes
	int32x4_t xand = AKSIMD_AND_V4I32(xvec, temp);		// isolate each lane's bit of x
	return AKSIMD_EQ_V4I32(temp, xand);					// all-ones where the bit was set
}
978 
979 
980 //@}
981 ////////////////////////////////////////////////////////////////////////
982 
AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32(const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
Vector multiply-add operation.
Definition: AkSimd.h:668
static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER(uint16x4_t vecs16)
Definition: AkSimd.h:258
static AkForceInline AKSIMD_V4I32 AKSIMD_ROUND_V4F32_TO_V4I32(AKSIMD_V4F32 a)
Definition: AkSimd.h:230
uint32x4_t AKSIMD_V4UI32
Vector of 4 32-bit unsigned integers.
Definition: AkSimdTypes.h:57
static AkForceInline float32x4_t AKSIMD_SETV_V2F64(AkReal64 b, AkReal64 a)
Definition: AkSimd.h:116
static AkForceInline float32x4_t AKSIMD_SETV_V4F32(float32_t d, float32_t c, float32_t b, float32_t a)
Populates the full vector with the 4 floating point elements provided.
Definition: AkSimd.h:73
#define AKSIMD_SET_V4I32(__scalar__)
Sets the four integer values to scalar
Definition: AkSimd.h:83
#define AKSIMD_DECLARE_V4F32(_x, _a, _b, _c, _d)
Definition: AkSimd.h:103
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:793
static AkForceInline AKSIMD_V4F32 AKSIMD_NOT_V4F32(const AKSIMD_V4F32 &in_vec)
Definition: AkSimd.h:460
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32(AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2)
Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary par...
Definition: AkSimd.h:722
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:854
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16 *addr3, AkInt16 *addr2, AkInt16 *addr1, AkInt16 *addr0)
Definition: AkSimd.h:823
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimdTypes.h:62
static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a)
Definition: AkSimd.h:85
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:656
static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:444
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:525
#define AKSIMD_AND_V4I32(__a__, __b__)
Definition: AkSimd.h:418
static AkForceInline AKSIMD_V4F32 AKSIMD_AND_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:452
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32(AKSIMD_V4F32 vVec)
Definition: AkSimd.h:712
#define AKSIMD_MUL_SS_V4F32(__a__, __b__)
Definition: AkSimd.h:646
int32_t AkInt32
Signed 32-bit integer.
static AkForceInline AKSIMD_V4F32 AKSIMD_ADDSUB_V4F32(AKSIMD_V4F32 vIn1, AKSIMD_V4F32 vIn2)
Definition: AkSimd.h:740
#define AKSIMD_EQ_V4I32(__a__, __b__)
Compare each integer element and return control mask.
Definition: AkSimd.h:906
int16_t AkInt16
Signed 16-bit integer.
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:769
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:626
double AkReal64
64-bit floating point
static AkForceInline bool AKSIMD_TESTONES_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:957
#define AKSIMD_DECLARE_V4I32(_x, _a, _b, _c, _d)
Definition: AkSimd.h:107
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:588
int32x4_t AKSIMD_V4I32
Vector of 4 32-bit signed integers.
Definition: AkSimdTypes.h:54
uint32_t AkUInt32
Unsigned 32-bit integer.
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimdTypes.h:64
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:781
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND(AkUInt32 x)
Definition: AkSimd.h:971
#define AkForceInline
Definition: AkTypes.h:63
static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:436
static AkForceInline bool AKSIMD_TESTZERO_V4I32(AKSIMD_V4I32 a)
Definition: AkSimd.h:944
static AkForceInline int AKSIMD_MASK_V4F32(const AKSIMD_V4UI32 &in_vec1)
Definition: AkSimd.h:919
#define AKSIMD_ADD_SS_V4F32(__a__, __b__)
Definition: AkSimd.h:614
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16(AKSIMD_V4F32 vec)
Definition: AkSimd.h:307
static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a)
Definition: AkSimd.h:94

Was this page helpful?

Need Support?

Questions? Problems? Need more info? Contact us, and we can help!

Visit our Support page

Tell us about your project. We're here to help.

Register your project and we'll help you get started with no strings attached!

Get started with Wwise