Wwise SDK 2024.1.4
AkSimd.h
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Copyright (c) 2025 Audiokinetic Inc.
25 *******************************************************************************/
26 
27 // AkSimd.h
28 
29 /// \file
30 /// AKSIMD - arm_neon implementation
31 
32 #pragma once
33 
34 #if defined _MSC_VER && defined _M_ARM64
35  #include <arm64_neon.h>
36 #else
37  #include <arm_neon.h>
38 #endif
39 
42 
43 ////////////////////////////////////////////////////////////////////////
44 /// @name Platform specific memory size alignment for allocation purposes
45 //@{
46 
47 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
48 
49 //@}
50 ////////////////////////////////////////////////////////////////////////
51 
52 ////////////////////////////////////////////////////////////////////////
53 /// @name AKSIMD loading / setting
54 //@{
55 
56 /// Loads four single-precision, floating-point values.
57 /// The address has no alignment requirement, (see _mm_loadu_ps).
58 #define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
59 
60 /// Loads four single-precision, floating-point values.
61 /// The address has no alignment requirement, (see _mm_loadu_ps).
62 #define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
63 
64 /// Loads a single single-precision, floating-point value, copying it into
65 /// all four words (see _mm_load1_ps, _mm_load_ps1)
66 #define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
67 
68 /// Sets the four single-precision, floating-point values to __scalar__ (see
69 /// _mm_set1_ps, _mm_set_ps1)
70 #define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
71 
72 /// Populates the full vector with the 4 floating point elements provided
73 static AkForceInline float32x4_t AKSIMD_SETV_V4F32(float32_t d, float32_t c, float32_t b, float32_t a) {
74  float32x4_t ret = vdupq_n_f32(0);
75  ret = vsetq_lane_f32(d, ret, 3);
76  ret = vsetq_lane_f32(c, ret, 2);
77  ret = vsetq_lane_f32(b, ret, 1);
78  ret = vsetq_lane_f32(a, ret, 0);
79  return ret;
80 }
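// Illustrative usage sketch (not part of the original header): AKSIMD_SETV_V4F32 takes its
// arguments from the highest lane down to the lowest, so the last argument lands in lane 0.
//   AKSIMD_V4F32 v = AKSIMD_SETV_V4F32( 4.f, 3.f, 2.f, 1.f );
//   // lane 0 == 1.f, lane 1 == 2.f, lane 2 == 3.f, lane 3 == 4.f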
81 
82 /// Sets the four integer values to __scalar__
83 #define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
84 
85 /// Sets the sixteen 8-bit ints to __scalar__
86 #define AKSIMD_SET_V16I8( __scalar__ ) vdupq_n_s8( __scalar__ )
87 
88 static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a) {
89  int32x4_t ret = vdupq_n_s32(0);
90  ret = vsetq_lane_s32(d, ret, 3);
91  ret = vsetq_lane_s32(c, ret, 2);
92  ret = vsetq_lane_s32(b, ret, 1);
93  ret = vsetq_lane_s32(a, ret, 0);
94  return ret;
95 }
96 
97 static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a) {
98  // On 32b ARM, dealing with an int64_t could invoke loading in memory directly,
99  // e.g. dereferencing a 64b ptr as one of the inputs
100  // ultimately resulting in a potentially unaligned 64b load.
101  // By reinterpreting and using the 64b inputs as 32b inputs, even a load from
102  // a pointer will not have any alignment requirements
103  // ARM64 can handle dereferencing ptrs to 64b values directly safely, though
104 #if defined AK_CPU_ARM_64
105  int64x2_t ret = vdupq_n_s64(0);
106  ret = vsetq_lane_s64(b, ret, 1);
107  ret = vsetq_lane_s64(a, ret, 0);
108  return vreinterpretq_s32_s64(ret);
109 #else
110  int32x4_t ret = vdupq_n_s32(0);
111  ret = vsetq_lane_s32(int32_t((b >> 32) & 0xFFFFFFFF), ret, 3);
112  ret = vsetq_lane_s32(int32_t((b >> 0) & 0xFFFFFFFF), ret, 2);
113  ret = vsetq_lane_s32(int32_t((a >> 32) & 0xFFFFFFFF), ret, 1);
114  ret = vsetq_lane_s32(int32_t((a >> 0) & 0xFFFFFFFF), ret, 0);
115  return ret;
116 #endif
117 }
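// Illustrative note (not part of the original header): both paths produce the same layout, with
// the second argument 'a' in the low 64 bits and the first argument 'b' in the high 64 bits.
//   AKSIMD_V4I32 v = AKSIMD_SETV_V2I64( b64, a64 ); // b64, a64 are hypothetical int64_t values
//   // 32-bit lanes 0-1 hold the low/high halves of a64, lanes 2-3 those of b64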
118 
119 static AkForceInline float32x4_t AKSIMD_SETV_V2F64(AkReal64 b, AkReal64 a)
120 {
121 #if defined AK_CPU_ARM_64
122  float64x2_t ret = (float64x2_t)vdupq_n_s64(0);
123  ret = vsetq_lane_f64(b, ret, 1);
124  ret = vsetq_lane_f64(a, ret, 0);
125  return (float32x4_t)(ret);
126 #else
127  int64_t a64 = *(int64_t*)&a;
128  int64_t b64 = *(int64_t*)&b;
129  int32x4_t ret = vdupq_n_s32(0);
130  ret = vsetq_lane_s32(int32_t((b64 >> 32) & 0xFFFFFFFF), ret, 3);
131  ret = vsetq_lane_s32(int32_t((b64 >> 0) & 0xFFFFFFFF), ret, 2);
132  ret = vsetq_lane_s32(int32_t((a64 >> 32) & 0xFFFFFFFF), ret, 1);
133  ret = vsetq_lane_s32(int32_t((a64 >> 0) & 0xFFFFFFFF), ret, 0);
134  return vreinterpretq_f32_s32(ret);
135 #endif
136 }
137 
138 /// Sets the four single-precision, floating-point values to zero (see
139 /// _mm_setzero_ps)
140 #define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
141 
142 /// Loads a single-precision, floating-point value into the low word
143 /// and clears the upper three words.
144 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
145 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
146 
147 /// Loads four 32-bit signed integer values (aligned)
148 #define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
149 
150 /// Loads 8 16-bit signed integer values (aligned)
151 #define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
152 
153 /// Loads 4 16-bit signed integer values (aligned)
154 #define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
155 
156 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
157 #define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
158 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
159 #define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
160 
161 /// Loads two single-precision, floating-point values
162 #define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
163 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
164 
165 /// Sets the two single-precision, floating-point values to __scalar__
166 #define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
167 
168 /// Sets the two single-precision, floating-point values to zero
169 #define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
170 
171 /// Loads data from memory and de-interleaves
172 #define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
173 #define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
174 
175 /// Loads data from memory and de-interleaves; only selected lane
176 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
177 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
178 
179 //@}
180 ////////////////////////////////////////////////////////////////////////
181 
182 
183 ////////////////////////////////////////////////////////////////////////
184 /// @name AKSIMD storing
185 //@{
186 
187 /// Stores four single-precision, floating-point values.
188 /// The address has no alignment requirement, (see _mm_storeu_ps).
189 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
190 
191 /// Stores four single-precision, floating-point values.
192 /// The address has no alignment requirement, (see _mm_storeu_ps).
193 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
194 
195 /// Stores the lower single-precision, floating-point value.
196 /// *p := a0 (see _mm_store_ss)
197 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
198 
199 /// Stores four 32-bit integer values. The address must be 16-byte aligned.
200 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
201 
202 /// Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
203 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
204 
205 /// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
206 #define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
207 
208 /// Stores two single-precision, floating-point values. The address must be 16-byte aligned.
209 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
210 
211 /// Stores data by interleaving into memory
212 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
213 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
214 
215 /// Stores the lower double-precision, floating-point value.
216 /// *p := a0 (see _mm_store_sd)
217 #define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) vst1q_lane_f64( (float64_t*)(__addr__), vreinterpretq_f64_f32(__vec__), 0 )
218 
219 //@}
220 ////////////////////////////////////////////////////////////////////////
221 
222 
223 ////////////////////////////////////////////////////////////////////////
224 /// @name AKSIMD conversion
225 //@{
226 
227 /// Converts the four signed 32-bit integer values of a to single-precision,
228 /// floating-point values (see _mm_cvtepi32_ps)
229 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
230 
231 /// Converts the four single-precision, floating-point values of a to signed
232 /// 32-bit integer values (see _mm_cvtps_epi32)
233 static AkForceInline AKSIMD_V4I32 AKSIMD_ROUND_V4F32_TO_V4I32(AKSIMD_V4F32 a) {
234 #if defined AK_CPU_ARM_64
235  return vcvtaq_s32_f32(a);
236 #else
237  // on ARMv7, need to add 0.5 (away from zero) and truncate that
238  float32x4_t halfPos = vdupq_n_f32(0.5f);
239  float32x4_t halfNeg = vdupq_n_f32(-0.5f);
240  float32x4_t zero = vdupq_n_f32(0.0f);
241  const uint32x4_t signMask = vcgtq_f32(a, zero);
242  const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg);
243  const float32x4_t aOffset = vaddq_f32(a, signedHalf);
244  return vcvtq_s32_f32(aOffset);
245 #endif
246 }
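// Illustrative example (not part of the original header): both the AArch64 vcvtaq_s32_f32 path
// and the ARMv7 add-half-then-truncate path round halfway cases away from zero.
//   AKSIMD_V4F32 v = AKSIMD_SETV_V4F32( 2.5f, -2.5f, 1.4f, -1.4f );
//   AKSIMD_V4I32 r = AKSIMD_ROUND_V4F32_TO_V4I32( v ); // lanes 0..3: -1, 1, -3, 3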
247 
248 /// Converts the four single-precision, floating-point values of a to signed
249 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
250 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
251 
252 /// Converts the 4 half-precision floats in the lower 64-bits of the provided
253 /// vector to 4 full-precision floats
254 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
255 
256 /// Converts the 4 half-precision floats in the upper 64-bits of the provided
257 /// vector to 4 full-precision floats
258 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
259 
260 
261 static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER(uint16x4_t vecs16)
262 {
263 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
264 
265  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
266  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
267  float32x4_t ret;
268  __asm__("fcvtl %0.4s, %1.4h" \
269  : "=w"(ret) \
270  : "w"(vecs16)
271  );
272  return ret;
273 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
274  float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
275  float32x4_t ret = vcvt_f32_f16(vecf16);
276  uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16);
277  ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData);
278  return ret;
279 #elif defined(AK_CPU_ARM_64)
280  return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
281 #else
282  uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16);
283  uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000));
284  uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds
285 
286  // Determine if value is denorm or not, and use that to determine how much exponent should be scaled by, and if we need post-scale fp adjustment
287  uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
288  uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
289  uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
290 
291  // Apply the exponent increment and adjust
292  uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
293  uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
294 
295  // if fp16 val was inf or nan, preserve the inf/nan exponent field (we can just 'or' in the inf/nan exponent)
296  uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
297  uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
298  uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
299 
300  // reincorporate the sign
301  uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
302  float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
303  return assembledFloat;
304 #endif
305 }
306 
307 
308 /// Converts the 4 full-precision floats vector to 4 half-precision floats
309 /// occupying the lower bits and leaving the upper bits as zero
310 static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16(AKSIMD_V4F32 vec)
311 {
312 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
313  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
314  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
315  int32x4_t ret;
316  __asm__("fcvtn %1.4h, %1.4s\n" \
317  "\tmov %0.8b, %1.8b" \
318  : "=w"(ret) \
319  : "w"(vec)
320  );
321  return ret;
322 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
323  float16x4_t ret = vcvt_f16_f32(vec);
324  uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
325  ret = vorr_u16(vreinterpret_s16_f16(ret), signData);
326  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
327 #elif defined(AK_CPU_ARM_64)
328  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
329 #else
330  uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
331  uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
332 
333  // do the processing for values that will be denormed in float16
334  // Add 0.5 to get value within range, and rounde; then move mantissa data up
335  float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
336  uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
337 
338  // processing for values that will be normal in float16
339  uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF); // -131072 + rounding bias
340  uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
341  uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
342  uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31); // Extend Lsb so that it's -1 when set
343  uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
344  uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
345 
346  // Combine the norm and subnorm paths together
347  uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23); // smallest float32 that yields a normalized float16
348  uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
349 
350  uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
351 
352  // apply inf/nan check
353  uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000)); // test if exponent bits are zero or not
354  uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
355  uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec)); // mark the parts of the vector where we have a mantissa (i.e. NAN) as 0xffffffff
356  uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000)); // set the NaN mantissa bit if mantissa suggests this is NaN
357  uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
358  uint32x4_t infNanData = vorrq_u32(infData, nantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
359 
360  uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData); // and combine the results
361 
362  // reincorporate the original sign
363  uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
364 
365  // store results packed in lower 64 bits, and set upper 64 to zero
366  uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
367  return vreinterpretq_s32_u16(resultZip.val[1]);
368 #endif
369 }
370 
371 //@}
372 ////////////////////////////////////////////////////////////////////////
373 
374 ////////////////////////////////////////////////////////////////////////
375 /// @name AKSIMD cast
376 //@{
377 
378 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
379 /// used for compilation and does not generate any instructions, thus it has zero latency.
380 #define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
381 
382 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
383 /// used for compilation and does not generate any instructions, thus it has zero latency.
384 #define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
385 
386 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
387 /// used for compilation and does not generate any instructions, thus it has zero latency.
388 #define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
389 
390 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
391 /// used for compilation and does not generate any instructions, thus it has zero latency.
392 #define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
393 
394 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
395 /// used for compilation and does not generate any instructions, thus it has zero latency.
396 #define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
397 
398 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
399 /// used for compilation and does not generate any instructions, thus it has zero latency.
400 #define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
401 
402 #define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
403 
404 #define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
405 
406 /// Cast vector of type AKSIMD_V4COND to AKSIMD_V4I32.
407 #define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) (AKSIMD_V4COND)(__vec__)
408 
409 /// Cast vector of type AKSIMD_V4I32 to AKSIMD_V4COND.
410 #define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) (AKSIMD_V4COND)(__vec__)
411 
412 //@}
413 ////////////////////////////////////////////////////////////////////////
414 
415 ////////////////////////////////////////////////////////////////////////
416 /// @name AKSIMD logical operations
417 //@{
418 
419 /// Computes the bitwise AND of the 128-bit value in a and the
420 /// 128-bit value in b (see _mm_and_si128)
421 #define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
422 
423 /// Compares the 8 signed 16-bit integers in a and the 8 signed
424 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
425 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
426  vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
427 
428 /// Compares for less than or equal (see _mm_cmple_ps)
429 #define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
430 
431 #define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
432 #define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
433 
434 #define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32(__a__,__b__)
435 #define AKSIMD_NOT_V4I32( __a__ ) veorq_s32(__a__, vdupq_n_s32(~0u))
436 
437 #define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
438 
439 static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
440 {
441  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
442  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
443  uint32x4_t res = veorq_u32(t0, t1);
444  return vreinterpretq_f32_u32(res);
445 }
446 
447 static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
448 {
449  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
450  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
451  uint32x4_t res = vorrq_u32(t0, t1);
452  return vreinterpretq_f32_u32(res);
453 }
454 
455 static AkForceInline AKSIMD_V4F32 AKSIMD_AND_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
456 {
457  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
458  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
459  uint32x4_t res = vandq_u32(t0, t1);
460  return vreinterpretq_f32_u32(res);
461 }
462 
463 static AkForceInline AKSIMD_V4F32 AKSIMD_NOT_V4F32( const AKSIMD_V4F32& in_vec )
464 {
465  uint32x4_t allSet = vdupq_n_u32(~0u);
466  uint32x4_t reinterpret = vreinterpretq_u32_f32(in_vec);
467  uint32x4_t result = veorq_u32(reinterpret, allSet);
468  return vreinterpretq_f32_u32(result);
469 }
470 
471 #define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__)
472 #define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__)
473 
474 #define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
475 //@}
476 ////////////////////////////////////////////////////////////////////////
477 
478 
479 ////////////////////////////////////////////////////////////////////////
480 /// @name AKSIMD shifting
481 //@{
482 
483 /// Shifts the 4 signed or unsigned 32-bit integers in a left by
484 /// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
485 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
486  vshlq_n_s32( (__vec__), (__shiftBy__) )
487 
488 /// Shifts the 4 signed or unsigned 32-bit integers in a right by
489 /// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
490 #define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
491  vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
492 
493 /// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
494 /// bits while shifting in the sign bit (see _mm_srai_epi32)
495 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
496  vshrq_n_s32( (__vec__), (__shiftBy__) )
497 
498 //@}
499 ////////////////////////////////////////////////////////////////////////
500 
501 
502 ////////////////////////////////////////////////////////////////////////
503 /// @name AKSIMD shuffling
504 //@{
505 
506 // Macro for combining two vector of 2 elements into one vector of
507 // 4 elements.
508 #define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
509 
510 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
511 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
512  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
513 
514 /// Selects four specific single-precision, floating-point values from
515 /// a and b, based on the mask i (see _mm_shuffle_ps)
516 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
517 // If you get a link error, it's probably because the required
518 // _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > is not implemented in
519 // <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
520 #if defined(__clang__)
521  #if defined(__has_builtin) && __has_builtin(__builtin_shufflevector)
522  #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
523  __builtin_shufflevector(a, b, (zyxw >> 0) & 0x3, (zyxw >> 2) & 0x3, ((zyxw >> 4) & 0x3) + 4, ((zyxw >> 6) & 0x3) + 4 )
524  #endif
525 #endif
526 
527 #ifndef AKSIMD_SHUFFLE_V4F32
528 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
529  _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
530 
531 // Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are
532 // implemented in a separate header file to keep this one cleaner:
533 #include <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>
534 
535 #endif
536 
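// Illustrative example (not part of the original header): as with _mm_shuffle_ps, the two low
// result lanes are selected from the first operand and the two high lanes from the second.
//   AKSIMD_V4F32 r = AKSIMD_SHUFFLE_V4F32( a, b, AKSIMD_SHUFFLE(3,2,1,0) ); // a, b hypothetical
//   // r0 := a0 ; r1 := a1 ; r2 := b2 ; r3 := b3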
537 #define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
538 
539 /// Moves the upper two single-precision, floating-point values of b to
540 /// the lower two single-precision, floating-point values of the result.
541 /// The upper two single-precision, floating-point values of a are passed
542 /// through to the result.
543 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
544 #define AKSIMD_MOVEHL_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__b__, __a__, AKSIMD_SHUFFLE(3,2,3,2))
545 
546 /// Moves the lower two single-precision, floating-point values of b to
547 /// the upper two single-precision, floating-point values of the result.
548 /// The lower two single-precision, floating-point values of a are passed
549 /// through to the result.
550 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
551 #define AKSIMD_MOVELH_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__a__, __b__, AKSIMD_SHUFFLE(1,0,1,0))
552 
553 /// Swap the 2 lower floats together and the 2 higher floats together.
554 #define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
555 
556 /// Swap the 2 lower floats with the 2 higher floats.
557 #define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
558 
559 /// Barrel-shift all floats by one.
560 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
561 
562 /// Duplicates the odd items into the even items (d c b a -> d d b b )
563 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
564 
565 /// Duplicates the even items into the odd items (d c b a -> c c a a )
566 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
567 //@}
568 ////////////////////////////////////////////////////////////////////////
569 
570 
571 ////////////////////////////////////////////////////////////////////////
572 /// @name AKSIMD arithmetic
573 //@{
574 
575 /// Subtracts the four single-precision, floating-point values of
576 /// a and b (a - b) (see _mm_sub_ps)
577 #define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
578 
579 /// Subtracts the two single-precision, floating-point values of
580 /// a and b
581 #define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
582 
583 /// Subtracts the lower single-precision, floating-point values of a and b.
584 /// The upper three single-precision, floating-point values are passed through from a.
585 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
586 #define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
587  vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
588 
589 /// Adds the four single-precision, floating-point values of
590 /// a and b (see _mm_add_ps)
591 #define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
592 
593 /// Adds the two single-precision, floating-point values of
594 /// a and b
595 #define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
596 
597 /// Adds the four integers of a and b
598 #define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
599 
600 /// Multiplies the 4 low-parts of both operand into the 4 32-bit integers (no overflow)
601 #define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
602 
603 /// Multiplies the 4 low-parts of both operand into the 4 32-bit integers (no overflow)
604 #define AKSIMD_MULLO_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
605 
606 /// Compare the content of four single-precision, floating-point values of
607 /// a and b
608 #define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
609 
610 /// Compare the content of two single-precision, floating-point values of
611 /// a and b
612 #define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
613 
614 /// Adds the lower single-precision, floating-point values of a and b; the
615 /// upper three single-precision, floating-point values are passed through from a.
616 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
617 #define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
618  vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
619 
620 /// Multiplies the four single-precision, floating-point values
621 /// of a and b (see _mm_mul_ps)
622 #define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
623 
624 /// Multiplies the four single-precision, floating-point values of a
625 /// by the single-precision, floating-point scalar b
626 #define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
627 
628 /// Rough estimation of division
629 AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
630 {
631  AKSIMD_V4F32 inv = vrecpeq_f32(b);
632  AKSIMD_V4F32 restep = vrecpsq_f32(b, inv);
633  inv = vmulq_f32(restep, inv);
634  return vmulq_f32(a, inv);
635 }
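// Illustrative note (not part of the original header): vrecpeq_f32 alone is only a coarse
// reciprocal estimate; the vrecpsq_f32 step above applies one Newton-Raphson refinement before
// the final multiply, so the quotient is approximate rather than IEEE-exact.
//   AKSIMD_V4F32 q = AKSIMD_DIV_V4F32( num, den ); // q[i] ~= num[i] / den[i] (num, den hypothetical)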
636 
637 /// Multiplies the two single-precision, floating-point values
638 /// of a and b
639 #define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
640 
641 /// Multiplies the two single-precision, floating-point values of a
642 /// by the single-precision, floating-point scalar b
643 #define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
644 
645 /// Multiplies the lower single-precision, floating-point values of
646 /// a and b; the upper three single-precision, floating-point values
647 /// are passed through from a.
648 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
649 #define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
650  vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
651 
652 /// Vector multiply-add and multiply-subtract operations (AArch64 uses the fused variants directly where appropriate)
653 #if defined(AK_CPU_ARM_64)
654  #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vfmaq_f32( (__c__), (__a__), (__b__) )
655  #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) vfma_f32( (__c__), (__a__), (__b__) )
656  #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vfmaq_n_f32( (__c__), (__a__), (__b__) )
657  #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vfma_n_f32( (__c__), (__a__), (__b__) )
658 #else
659  #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
660  #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
661  #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
662  #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
663 #endif
664 
665 /// Not a direct translation to vfmsq_f32 because that operation does -a*b+c, not a*b-c.
666 /// Explicitly adding an additional negation tends to produce worse codegen than giving the compiler a chance to re-order things slightly
667 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
668 #define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
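// Illustrative example (not part of the original header): every MADD/MSUB variant multiplies the
// first two operands element-wise, then adds or subtracts the third.
//   AKSIMD_V4F32 acc = AKSIMD_MADD_V4F32( a, b, c ); // acc[i] == a[i]*b[i] + c[i] (a, b, c hypothetical)
//   AKSIMD_V4F32 dif = AKSIMD_MSUB_V4F32( a, b, c ); // dif[i] == a[i]*b[i] - c[i]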
669 
670 /// Vector multiply-add operation.
671 AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32(const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
672 {
673  return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
674 }
675 
676 /// Computes the minima of the four single-precision, floating-point
677 /// values of a and b (see _mm_min_ps)
678 #define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
679 
680 /// Computes the minima of the two single-precision, floating-point
681 /// values of a and b
682 #define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
683 
684 /// Computes the maximums of the four single-precision, floating-point
685 /// values of a and b (see _mm_max_ps)
686 #define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
687 
688 /// Computes the maximums of the two single-precision, floating-point
689 /// values of a and b
690 #define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
691 
692 /// Returns absolute value
693 #define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
694 
695 /// Changes the sign
696 #define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
697 #define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
698 
699 /// Square root (4 floats)
700 #define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
701 
702 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
703 #define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
704 
705 /// Square root (2 floats)
706 #define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
707 
708 /// Reciprocal of x (1/x)
709 #define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32(__a__)
710 
711 /// Faked in-place vector horizontal add.
712 /// \akwarning
713 /// Don't expect this to be very efficient.
714 /// \endakwarning
715 static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32(AKSIMD_V4F32 vVec)
716 {
717  AKSIMD_V4F32 vAb = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0xB1);
718  AKSIMD_V4F32 vHaddAb = AKSIMD_ADD_V4F32(vVec, vAb);
719  AKSIMD_V4F32 vHaddCd = AKSIMD_SHUFFLE_V4F32(vHaddAb, vHaddAb, 0x4E);
720  AKSIMD_V4F32 vHaddAbcd = AKSIMD_ADD_V4F32(vHaddAb, vHaddCd);
721  return vHaddAbcd;
722 }
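// Illustrative example (not part of the original header): every lane of the result holds the sum
// of all four input lanes.
//   AKSIMD_V4F32 v = AKSIMD_SETV_V4F32( 4.f, 3.f, 2.f, 1.f );
//   AKSIMD_V4F32 s = AKSIMD_HORIZONTALADD_V4F32( v ); // each lane == 10.f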
723 
724 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
725 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32(AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2)
726 {
727  static const AKSIMD_V4F32 vSign = AKSIMD_SETV_V4F32( 0.f, -0.f, 0.f, -0.f );
728 
729  float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2); // val[0] will be reals extended, val[1] will be imag
730  vC2Ext.val[1] = AKSIMD_XOR_V4F32(vC2Ext.val[1], vSign);
731  float32x4_t vC1Rev = vrev64q_f32(vCIn1);
732  float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);
733  float32x4_t vFinal = AKSIMD_MADD_V4F32(vC1Rev, vC2Ext.val[1], vMul);
734  return vFinal;
735 }
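// Illustrative example (not part of the original header): each operand holds two complex numbers
// as interleaved (real, imaginary) pairs, and the two products are computed independently.
//   AKSIMD_V4F32 x = AKSIMD_SETV_V4F32( 2.f, 1.f, 2.f, 1.f ); // (1 + 2i) in both pairs
//   AKSIMD_V4F32 y = AKSIMD_SETV_V4F32( 4.f, 3.f, 4.f, 3.f ); // (3 + 4i) in both pairs
//   AKSIMD_V4F32 z = AKSIMD_COMPLEXMUL_V4F32( x, y );         // (-5 + 10i) in both pairs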
736 
737 // Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a
738 // to/from packed elements in b, and store the results in dst.
739 static AkForceInline AKSIMD_V4F32 AKSIMD_ADDSUB_V4F32(AKSIMD_V4F32 vIn1, AKSIMD_V4F32 vIn2)
740 {
741  static const AKSIMD_V4F32 vSign = AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f);
742  float32x4_t vIn2SignFlip = AKSIMD_XOR_V4F32(vIn2, vSign);
743  return vaddq_f32(vIn1, vIn2SignFlip);
744 }
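// Illustrative note (not part of the original header): matches _mm_addsub_ps lane behaviour.
//   AKSIMD_V4F32 r = AKSIMD_ADDSUB_V4F32( in1, in2 ); // in1, in2 hypothetical
//   // r0 := in1_0 - in2_0 ; r1 := in1_1 + in2_1 ; r2 := in1_2 - in2_2 ; r3 := in1_3 + in2_3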
745 
746 //@}
747 ////////////////////////////////////////////////////////////////////////
748 
749 
750 ////////////////////////////////////////////////////////////////////////
751 /// @name AKSIMD packing / unpacking
752 //@{
753 
754 /// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
755 /// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
756 #define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
757 
758 /// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
759 /// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
760 #define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
761 
762 /// Selects and interleaves the lower two single-precision, floating-point
763 /// values from a and b (see _mm_unpacklo_ps)
764 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
765 {
766  // sce_vectormath_xayb(in_vec1, in_vec2)
767  float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
768  float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
769  float32x2x2_t xa_yb = vtrn_f32( xy, ab );
770  AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
771  return xayb;
772 }
773 
774 /// Selects and interleaves the upper two single-precision, floating-point
775 /// values from a and b (see _mm_unpackhi_ps)
776 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
777 {
778  //return sce_vectormath_zcwd( in_vec1, in_vec2 );
779  float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
780  float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
781  float32x2x2_t zc_wd = vtrn_f32( zw, cd );
782  AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
783  return zcwd;
784 }
785 
786 /// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
787 /// integers and saturates (see _mm_packs_epi32)
788 AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
789 {
790  int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
791  int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
792  int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
793  return vreinterpretq_s32_s16( result );
794 }
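// Illustrative example (not part of the original header): values outside the int16 range
// saturate, as with _mm_packs_epi32.
//   AKSIMD_V4I32 v = AKSIMD_SET_V4I32( 40000 );
//   AKSIMD_V4I32 p = AKSIMD_PACKS_V4I32( v, v ); // all eight 16-bit lanes == 32767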
795 
796 /// V1 = {a,b} => VR = {b,c}
797 /// V2 = {c,d} =>
798 #define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
799 
800 /// V1 = {a,b} => V1 = {a,c}
801 /// V2 = {c,d} => V2 = {b,d}
802 #define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
803 
804 #define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
805 
806 /// V1 = {a,b} => VR = {b,a}
807 #define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
808 
809 // Given four pointers, gathers 32-bits of data from each location,
810 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
811 // e.g. (*addr[0]) := (b a)
812 // e.g. (*addr[1]) := (d c)
813 // e.g. (*addr[2]) := (f e)
814 // e.g. (*addr[3]) := (h g)
815 // return struct has
816 // val[0] := (g e c a)
817 // val[1] := (h f d b)
818 static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
819 {
820  int16x4x2_t ret16{
821  vdup_n_s16(0),
822  vdup_n_s16(0)
823  };
824 
825  ret16 = vld2_lane_s16(addr0, ret16, 0);
826  ret16 = vld2_lane_s16(addr1, ret16, 1);
827  ret16 = vld2_lane_s16(addr2, ret16, 2);
828  ret16 = vld2_lane_s16(addr3, ret16, 3);
829 
830  AKSIMD_V4I32X2 ret{
831  vmovl_s16(ret16.val[0]),
832  vmovl_s16(ret16.val[1])
833  };
834  return ret;
835 }
836 
837 // Given four pointers, gathers 64-bits of data from each location,
838 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
839 // e.g. (*addr[0]) := (d c b a)
840 // e.g. (*addr[1]) := (h g f e)
841 // e.g. (*addr[2]) := (l k j i)
842 // e.g. (*addr[3]) := (p o n m)
843 // return struct has
844 // val[0] := (m i e a)
845 // val[1] := (n j f b)
846 // val[2] := (o k g c)
847 // val[3] := (p l h d)
848 
849 static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4(AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0)
850 {
851  int16x4x4_t ret16{
852  vdup_n_s16(0),
853  vdup_n_s16(0),
854  vdup_n_s16(0),
855  vdup_n_s16(0)
856  };
857 
858  ret16 = vld4_lane_s16(addr0, ret16, 0);
859  ret16 = vld4_lane_s16(addr1, ret16, 1);
860  ret16 = vld4_lane_s16(addr2, ret16, 2);
861  ret16 = vld4_lane_s16(addr3, ret16, 3);
862 
863  AKSIMD_V4I32X4 ret{
864  vmovl_s16(ret16.val[0]),
865  vmovl_s16(ret16.val[1]),
866  vmovl_s16(ret16.val[2]),
867  vmovl_s16(ret16.val[3])
868  };
869  return ret;
870 }
871 
872 //@}
873 ////////////////////////////////////////////////////////////////////////
874 
875 ////////////////////////////////////////////////////////////////////////
876 /// @name AKSIMD vector comparison
877 /// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
878 //@{
879 
880 #define AKSIMD_CMP_CTRLMASK uint32x4_t
881 
882 /// Compare each float element and return control mask.
883 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
884 
885 /// Compare each float element and return control mask.
886 #define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
887 
888 /// Compare each float element and return control mask.
889 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
890 
891 /// Compare each float element and return control mask.
892 #define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
893 
894 /// Compare each integer element and return control mask.
895 #define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
896 
897 /// Compare each float element and return control mask.
898 #define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
899 
900 /// Compare each integer element and return control mask.
901 #define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
902 
903 /// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
904 #define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
905 
906 // (cond1 >= cond2) ? b : a.
907 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
908 
909 // a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax.
910 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
911 
912 #define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
913 
914 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 )
915 {
916  static const uint32x4_t movemask = vreinterpretq_u32_s32(AKSIMD_SETV_V4I32( 8, 4, 2, 1 ));
917  static const uint32x4_t highbit = vreinterpretq_u32_s32(AKSIMD_SETV_V4I32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ));
918 
919  uint32x4_t t0 = in_vec1;
920  uint32x4_t t1 = vtstq_u32(t0, highbit);
921  uint32x4_t t2 = vandq_u32(t1, movemask);
922  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
923  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
924 }
925 
926 #ifndef AK_WIN
927 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
928 {
929  return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
930 }
931 #endif
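// Illustrative example (not part of the original header): AKSIMD_MASK_V4F32 emulates
// _mm_movemask_ps, packing the four sign bits into the low four bits of the result.
//   AKSIMD_V4F32 v = AKSIMD_SETV_V4F32( 4.f, -3.f, 2.f, -1.f );
//   int m = AKSIMD_MASK_V4F32( AKSIMD_CAST_V4F32_TO_V4COND( v ) ); // m == 5 (lanes 0 and 2 negative)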
932 
933 // returns true if every element of the provided vector is zero
934 static AkForceInline bool AKSIMD_TESTZERO_V4I32(AKSIMD_V4I32 a)
935 {
936 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
937  uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
938  return maxValue == 0;
939 #else
940  int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
941  return vget_lane_s64(orReduce, 0) == 0;
942 #endif
943 }
944 #define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
945 #define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
946 
947 static AkForceInline bool AKSIMD_TESTONES_V4I32(AKSIMD_V4I32 a)
948 {
949 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
950  uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
951  return minValue == ~0;
952 #else
953  int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
954  return vget_lane_s64(andReduce, 0) == ~0LL;
955 #endif
956 }
957 #define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
958 #define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))
959 
960 
961 static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND(AkUInt32 x)
962 {
963  int32x4_t temp = AKSIMD_SETV_V4I32(8, 4, 2, 1);
964  int32x4_t xvec = AKSIMD_SET_V4I32((AkInt32)x);
965  int32x4_t xand = AKSIMD_AND_V4I32(xvec, temp);
966  return AKSIMD_EQ_V4I32(temp, xand);
967 }
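// Illustrative example (not part of the original header): expands the low four bits of x into
// per-lane all-ones/all-zeros masks, bit i controlling lane i.
//   AKSIMD_V4COND c = AKSIMD_SETMASK_V4COND( 0x5 );  // lanes 0 and 2 all-ones, lanes 1 and 3 zero
//   AKSIMD_V4F32  r = AKSIMD_VSEL_V4F32( a, b, c );  // picks b in lanes 0 and 2, a elsewhere (a, b hypothetical)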
968 
969 
970 //@}
971 ////////////////////////////////////////////////////////////////////////
972 
