Wwise SDK 2019.2.15
AkSimd.h
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Version: <VERSION> Build: <BUILDNUMBER>
25  Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
26 *******************************************************************************/
27 
28 // AkSimd.h
29 
30 /// \file
31 /// AKSIMD - arm_neon implementation
32 
33 #ifndef _AKSIMD_ARM_NEON_H_
34 #define _AKSIMD_ARM_NEON_H_
35 
36 #if defined _MSC_VER && defined _M_ARM64
37  #include <arm64_neon.h>
38 #else
39  #include <arm_neon.h>
40 #endif
42 
43 // Platform specific defines for prefetching
44 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control the maximum amount of prefetching that is desirable (assuming an 8-way cache)
45 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
46 #if defined __clang__ || defined __GNUC__
47 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) __builtin_prefetch(((char *)(__add__))+(__offset__))
48 #else
49 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ )
50 #endif
51 
52 ////////////////////////////////////////////////////////////////////////
53 /// @name Platform specific memory size alignment for allocation purposes
54 //@{
55 
56 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
57 
58 //@}
59 ////////////////////////////////////////////////////////////////////////
60 
61 ////////////////////////////////////////////////////////////////////////
62 /// @name AKSIMD types
63 //@{
64 
65 typedef int32x4_t AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers
66 typedef int16x8_t AKSIMD_V8I16; ///< Vector of 8 16-bit signed integers
67 typedef int16x4_t AKSIMD_V4I16; ///< Vector of 4 16-bit signed integers
68 typedef uint32x4_t AKSIMD_V4UI32; ///< Vector of 4 32-bit unsigned integers
69 typedef uint32x2_t AKSIMD_V2UI32; ///< Vector of 2 32-bit unsigned integers
70 typedef int32x2_t AKSIMD_V2I32; ///< Vector of 2 32-bit signed integers
71 typedef float32_t AKSIMD_F32; ///< 32-bit float
72 typedef float32x2_t AKSIMD_V2F32; ///< Vector of 2 32-bit floats
73 typedef float32x4_t AKSIMD_V4F32; ///< Vector of 4 32-bit floats
74 
75 typedef uint32x4_t AKSIMD_V4COND; ///< Vector of 4 comparison results
76 typedef uint32x4_t AKSIMD_V4ICOND; ///< Vector of 4 comparison results
77 typedef uint32x4_t AKSIMD_V4FCOND; ///< Vector of 4 comparison results
78 
79 #if defined(AK_CPU_ARM_NEON)
80 typedef float32x2x2_t AKSIMD_V2F32X2;
81 typedef float32x4x2_t AKSIMD_V4F32X2;
82 typedef float32x4x4_t AKSIMD_V4F32X4;
83 
84 typedef int32x4x2_t AKSIMD_V4I32X2;
85 typedef int32x4x4_t AKSIMD_V4I32X4;
86 #endif
87 
88 //@}
89 ////////////////////////////////////////////////////////////////////////
90 
91 ////////////////////////////////////////////////////////////////////////
92 /// @name AKSIMD loading / setting
93 //@{
94 
95 /// Loads four single-precision, floating-point values (see _mm_load_ps)
96 #define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
97 
98 /// Loads four single-precision floating-point values from unaligned
99 /// memory (see _mm_loadu_ps)
100 #define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
101 
102 /// Loads a single single-precision, floating-point value, copying it into
103 /// all four words (see _mm_load1_ps, _mm_load_ps1)
104 #define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
105 
106 /// Sets the four single-precision, floating-point values to __scalar__ (see
107 /// _mm_set1_ps, _mm_set_ps1)
108 #define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
109 
110 /// Sets the four integer values to __scalar__
111 #define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
112 
113 static AkForceInline int32x4_t AKSIMD_SETV_V4I32(int32_t d, int32_t c, int32_t b, int32_t a) {
114  int32x4_t ret = vdupq_n_s32(0);
115  ret = vsetq_lane_s32(d, ret, 3);
116  ret = vsetq_lane_s32(c, ret, 2);
117  ret = vsetq_lane_s32(b, ret, 1);
118  ret = vsetq_lane_s32(a, ret, 0);
119  return ret;
120 }
121 
122 static AkForceInline int32x4_t AKSIMD_SETV_V2I64(int64_t b, int64_t a) {
123  // On 32-bit ARM, handling an int64_t argument may involve loading it straight from memory,
124  // e.g. when one of the inputs comes from dereferencing a 64-bit pointer,
125  // ultimately resulting in a potentially unaligned 64-bit load.
126  // By reinterpreting and using the 64-bit inputs as 32-bit inputs, even a load from
127  // a pointer has no alignment requirements.
128  // ARM64, however, can safely dereference pointers to 64-bit values directly.
129 #if defined AK_CPU_ARM_64
130  int64x2_t ret = vdupq_n_s64(0);
131  ret = vsetq_lane_s64(b, ret, 1);
132  ret = vsetq_lane_s64(a, ret, 0);
133  return vreinterpretq_s32_s64(ret);
134 #else
135  int32x4_t ret = vdupq_n_s32(0);
136  ret = vsetq_lane_s32(int32_t((b >> 32) & 0xFFFFFFFF), ret, 3);
137  ret = vsetq_lane_s32(int32_t((b >> 0) & 0xFFFFFFFF), ret, 2);
138  ret = vsetq_lane_s32(int32_t((a >> 32) & 0xFFFFFFFF), ret, 1);
139  ret = vsetq_lane_s32(int32_t((a >> 0) & 0xFFFFFFFF), ret, 0);
140  return ret;
141 #endif
142 }
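Editorial note, not part of the SDK header: the set-by-value helpers above take their arguments from the highest lane down to the lowest, mirroring _mm_set_epi32. A minimal usage sketch follows; the include path and the SetvExample function name are assumptions for illustration only.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path; adjust to your project setup

static void SetvExample()
{
	// Arguments run from the highest lane to the lowest:
	// lane 3 = 4, lane 2 = 3, lane 1 = 2, lane 0 = 1.
	AKSIMD_V4I32 v = AKSIMD_SETV_V4I32(4, 3, 2, 1);

	// Same idea for the 64-bit variant: 'a' fills lanes {1,0}, 'b' fills lanes {3,2},
	// so the result below is {1, 2, 3, 4} when read as 32-bit lanes 0..3.
	AKSIMD_V4I32 v64 = AKSIMD_SETV_V2I64(0x0000000400000003LL, 0x0000000200000001LL);

	(void)v; (void)v64;
}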
143 
144 /// Sets the four single-precision, floating-point values to zero (see
145 /// _mm_setzero_ps)
146 #define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
147 
148 /// Loads a single-precision, floating-point value into the low word
149 /// and clears the upper three words.
150 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
151 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 );
152 
153 /// Loads four 32-bit signed integer values (aligned)
154 #define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
155 
156 /// Loads 8 16-bit signed integer values (aligned)
157 #define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
158 
159 /// Loads 4 16-bit signed integer values (aligned)
160 #define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
161 
162 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
163 #define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
164 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
165 #define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
166 
167 /// Loads two single-precision, floating-point values
168 #define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
169 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
170 
171 /// Sets the two single-precision, floating-point values to __scalar__
172 #define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
173 
174 /// Sets the two single-precision, floating-point values to zero
175 #define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
176 
177 /// Loads data from memory and de-interleaves
178 #define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
179 #define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
180 
181 /// Loads data from memory and de-interleaves; only selected lane
182 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
183 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
184 
185 //@}
186 ////////////////////////////////////////////////////////////////////////
187 
188 
189 ////////////////////////////////////////////////////////////////////////
190 /// @name AKSIMD storing
191 //@{
192 
193 /// Stores four single-precision, floating-point values. The address must be 16-byte aligned
194 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
195 
196 /// Stores four single-precision, floating-point values. The address does not need to be 16-byte aligned.
197 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
198 
199 /// Stores the lower single-precision, floating-point value.
200 /// *p := a0 (see _mm_store_ss)
201 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
202 
203 /// Stores four 32-bit integer values. The address must be 16-byte aligned.
204 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
205 
206 /// Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
207 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
208 
209 /// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
210 #define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
211 
212 /// Stores two single-precision, floating-point values. The address must be 16-byte aligned.
213 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
214 
215 /// Stores data by interleaving into memory
216 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
217 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
218 
219 //@}
220 ////////////////////////////////////////////////////////////////////////
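Editorial illustration of the load/store macros above, not part of the header. On this NEON implementation the aligned and unaligned variants map to the same vld1q_f32/vst1q_f32 instructions, but the aligned macros still document a 16-byte alignment contract. The include path and function name are assumptions.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path

static void LoadStoreExample(const float* in_pUnaligned, float* out_pDest)
{
	alignas(16) float aligned[4] = { 1.f, 2.f, 3.f, 4.f };

	AKSIMD_V4F32 vA = AKSIMD_LOAD_V4F32(aligned);        // documented as requiring 16-byte alignment
	AKSIMD_V4F32 vB = AKSIMD_LOADU_V4F32(in_pUnaligned); // no alignment requirement

	AKSIMD_STORE_V4F32(aligned, vB);                      // aligned store
	AKSIMD_STOREU_V4F32(out_pDest, vA);                   // unaligned store; out_pDest must hold four floats
}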
221 
222 
223 ////////////////////////////////////////////////////////////////////////
224 /// @name AKSIMD conversion
225 //@{
226 
227 /// Converts the four signed 32-bit integer values of a to single-precision,
228 /// floating-point values (see _mm_cvtepi32_ps)
229 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
230 
231 /// Converts the four single-precision, floating-point values of a to signed
232 /// 32-bit integer values (see _mm_cvtps_epi32)
233 static AkForceInline AKSIMD_V4I32 AKSIMD_ROUND_V4F32_TO_V4I32( AKSIMD_V4F32 a ) {
234 #if defined AK_CPU_ARM_64
235  return vcvtaq_s32_f32(a);
236 #else
237  // on ARMv7, need to add 0.5 (away from zero) and truncate that
238  float32x4_t halfPos = vdupq_n_f32(0.5f);
239  float32x4_t halfNeg = vdupq_n_f32(-0.5f);
240  float32x4_t zero = vdupq_n_f32(0.0f);
241  const uint32x4_t signMask = vcgtq_f32(a, zero);
242  const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg);
243  const float32x4_t aOffset = vaddq_f32(a, signedHalf);
244  return vcvtq_s32_f32(aOffset);
245 #endif
246 }
247 
248 /// Converts the four single-precision, floating-point values of a to signed
249 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
250 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
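Editorial sketch of the difference between the rounding and truncating conversions above, not part of the header; the include path and function name are assumptions.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path

static void ConvertExample()
{
	alignas(16) float in[4] = { 1.25f, 2.5f, -1.5f, -2.75f };
	AKSIMD_V4F32 v = AKSIMD_LOAD_V4F32(in);

	// Round to nearest, ties away from zero: { 1, 3, -2, -3 }
	AKSIMD_V4I32 vRounded = AKSIMD_ROUND_V4F32_TO_V4I32(v);

	// Truncate toward zero: { 1, 2, -1, -2 }
	AKSIMD_V4I32 vTruncated = AKSIMD_TRUNCATE_V4F32_TO_V4I32(v);

	(void)vRounded; (void)vTruncated;
}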
251 
252 /// Converts the 4 half-precision floats in the lower 64-bits of the provided
253 /// vector to 4 full-precision floats
254 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
255 
256 /// Converts the 4 half-precision floats in the upper 64-bits of the provided
257 /// vector to 4 full-precision floats
258 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
259 
260 
261 static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( uint16x4_t vecs16 )
262 {
263 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
264 
265  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
266  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
267  float32x4_t ret;
268  __asm__("fcvtl %0.4s, %1.4h" \
269  : "=w"(ret) \
270  : "w"(vecs16)
271  );
272  return ret;
273 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
274  float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
275  float32x4_t ret = vcvt_f32_f16(vecf16);
276  uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16);
277  ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData);
278  return ret;
279 #elif defined(AK_CPU_ARM_64)
280  return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
281 #else
282  uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16);
283  uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000));
284  uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3); // shift so that the float16 exp/mant is now split along float32's bounds
285 
286  // Determine if value is denorm or not, and use that to determine how much exponent should be scaled by, and if we need post-scale fp adjustment
287  uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
288  uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
289  uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
290 
291  // Apply the exponent increment and adjust
292  uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
293  uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
294 
295  // if the fp16 value was inf or nan, preserve the inf/nan exponent field (we can simply OR in the inf/nan exponent)
296  uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
297  uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
298  uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
299 
300  // reincorporate the sign
301  uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
302  float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
303  return assembledFloat;
304 #endif
305 }
306 
307 
308 /// Converts the 4 full-precision floats vector to 4 half-precision floats
309 /// occupying the lower bits and leaving the upper bits as zero
310 static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16( AKSIMD_V4F32 vec )
311 {
312 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
313  // float16 intrinsics were added in gcc 6.1, and we still use gcc4.9 on Android
314  // all compilers that we support for arm64 - i.e. clang/msvc - support the intrinsics just fine
315  int32x4_t ret;
316  __asm__("fcvtn %1.4h, %1.4s\n" \
317  "\tmov %0.8b, %1.8b" \
318  : "=w"(ret) \
319  : "w"(vec)
320  );
321  return ret;
322 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
323  float16x4_t ret = vcvt_f16_f32(vec);
324  uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
325  ret = vorr_u16(vreinterpret_s16_f16(ret), signData);
326  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
327 #elif defined(AK_CPU_ARM_64)
328  return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
329 #else
330  uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
331  uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
332 
333  // do the processing for values that will be denormed in float16
334  // Add 0.5 to get the value within range and round it; then move the mantissa data up
335  float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
336  uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
337 
338  // processing for values that will be normal in float16
339  uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF); // -131072 + rounding bias
340  uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
341  uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
342  uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31); // Extend Lsb so that it's -1 when set
343  uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb); // and subtract the sign-extended bit to finish rounding up
344  uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
345 
346  // Combine the norm and subnorm paths together
347  uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23); // smallest float32 that yields a normalized float16
348  uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
349 
350  uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
351 
352  // apply inf/nan check
353  uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000)); // true where the magnitude is below 65536.0f; larger values (and f32 inf/NaN) map to f16 inf/NaN
354  uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
355  uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec)); // lanes where vec != vec (i.e. NaN) are set to 0xffffffff
356  uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000)); // set the NaN mantissa bit if mantissa suggests this is NaN
357  uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000)); // grab the exponent data from unsigned vec with no mantissa
358  uint32x4_t infNanData = vorrq_u32(infData, nantissaBit); // if we have a non-zero mantissa, add the NaN mantissa bit
359 
360  uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData); // and combine the results
361 
362  // reincorporate the original sign
363  uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
364 
365  // store results packed in lower 64 bits, and set upper 64 to zero
366  uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
367  return vreinterpretq_s32_u16(resultZip.val[1]);
368 #endif
369 }
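Editorial round-trip sketch for the half-precision helpers above, not part of the header; the include path and function name are assumptions. The packed result keeps the four float16 values in the low 64 bits of a V4I32 and zeroes the upper 64 bits, which is what the _LO expansion macro expects.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path

static AKSIMD_V4F32 HalfRoundTripExample(AKSIMD_V4F32 in_v)
{
	// Four float16 values in the low 64 bits, upper 64 bits zero.
	AKSIMD_V4I32 vPacked = AKSIMD_CONVERT_V4F32_TO_V4F16(in_v);

	// Expand back to four float32; lossy for values that do not fit in half precision.
	return AKSIMD_CONVERT_V4F16_TO_V4F32_LO(vPacked);
}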
370 
371 //@}
372 ////////////////////////////////////////////////////////////////////////
373 
374 ////////////////////////////////////////////////////////////////////////
375 /// @name AKSIMD cast
376 //@{
377 
378 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4F32. This intrinsic is only
379 /// used for compilation and does not generate any instructions, thus it has zero latency.
380 #define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
381 
382 /// Cast vector of type AKSIMD_V2F64 to type AKSIMD_V4I32. This intrinsic is only
383 /// used for compilation and does not generate any instructions, thus it has zero latency.
384 #define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
385 
386 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V2F64. This intrinsic is only
387 /// used for compilation and does not generate any instructions, thus it has zero latency.
388 #define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
389 
390 /// Cast vector of type AKSIMD_V4F32 to type AKSIMD_V4I32. This intrinsic is only
391 /// used for compilation and does not generate any instructions, thus it has zero latency.
392 #define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
393 
394 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V2F64. This intrinsic is only
395 /// used for compilation and does not generate any instructions, thus it has zero latency.
396 #define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
397 
398 /// Cast vector of type AKSIMD_V4I32 to type AKSIMD_V4F32. This intrinsic is only
399 /// used for compilation and does not generate any instructions, thus it has zero latency.
400 #define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
401 
402 #define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
403 
404 #define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
405 
406 //@}
407 ////////////////////////////////////////////////////////////////////////
408 
409 ////////////////////////////////////////////////////////////////////////
410 /// @name AKSIMD logical operations
411 //@{
412 
413 /// Computes the bitwise AND of the 128-bit value in a and the
414 /// 128-bit value in b (see _mm_and_si128)
415 #define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
416 
417 /// Compares the 8 signed 16-bit integers in a and the 8 signed
418 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
419 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
420  vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
421 
422 /// Compares for less than or equal (see _mm_cmple_ps)
423 #define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
424 
425 #define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
426 #define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
427 
428 #define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32(__a__,__b__)
429 #define AKSIMD_NOT_V4I32( __a__ ) veorq_s32(__a__, vdupq_n_s32(~0u))
430 
431 #define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
432 
433 static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
434 {
435  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
436  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
437  uint32x4_t res = veorq_u32(t0, t1);
438  return vreinterpretq_f32_u32(res);
439 }
440 
441 static AkForceInline AKSIMD_V4F32 AKSIMD_OR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
442 {
443  uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
444  uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
445  uint32x4_t res = vorrq_u32(t0, t1);
446  return vreinterpretq_f32_u32(res);
447 }
448 
449 #define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__)
450 #define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__)
451 
452 #define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
453 //@}
454 ////////////////////////////////////////////////////////////////////////
455 
456 
457 ////////////////////////////////////////////////////////////////////////
458 /// @name AKSIMD shifting
459 //@{
460 
461 /// Shifts the 4 signed or unsigned 32-bit integers in a left by
462 /// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
463 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
464  vshlq_n_s32( (__vec__), (__shiftBy__) )
465 
466 /// Shifts the 4 signed or unsigned 32-bit integers in a right by
467 /// in_shiftBy bits while shifting in zeros (see _mm_srli_epi32)
468 #define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
469  vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
470 
471 /// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
472 /// bits while shifting in the sign bit (see _mm_srai_epi32)
473 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
474  vshrq_n_s32( (__vec__), (__shiftBy__) )
475 
476 //@}
477 ////////////////////////////////////////////////////////////////////////
478 
479 
480 ////////////////////////////////////////////////////////////////////////
481 /// @name AKSIMD shuffling
482 //@{
483 
484 // Macro for combining two vectors of 2 elements into one vector of
485 // 4 elements.
486 #define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
487 
488 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
489 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
490  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
491 
492 /// Selects four specific single-precision, floating-point values from
493 /// a and b, based on the mask i (see _mm_shuffle_ps)
494 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
495 // If you get a link error, it's probably because the required
496 // _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > is not implemented in
497 // <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
498 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
499  _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
500 
501 #define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
502 
503 /// Barrel-shift all floats by one.
504 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
505 
506 // Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are
507 // implemented in a separate header file to keep this one cleaner:
508 #include <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>
509 
510 /// Moves the upper two single-precision, floating-point values of b to
511 /// the lower two single-precision, floating-point values of the result.
512 /// The upper two single-precision, floating-point values of a are passed
513 /// through to the result.
514 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
515 inline AKSIMD_V4F32 AKSIMD_MOVEHL_V4F32( const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw )
516 {
517  //return akshuffle_zwcd( xyzw, abcd );
518  AKSIMD_V2F32 zw = vget_high_f32( xyzw );
519  AKSIMD_V2F32 cd = vget_high_f32( abcd );
520  AKSIMD_V4F32 zwcd = vcombine_f32( zw , cd );
521  return zwcd;
522 }
523 
524 /// Moves the lower two single-precision, floating-point values of b to
525 /// the upper two single-precision, floating-point values of the result.
526 /// The lower two single-precision, floating-point values of a are passed
527 /// through to the result.
528 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
529 inline AKSIMD_V4F32 AKSIMD_MOVELH_V4F32( const AKSIMD_V4F32& xyzw, const AKSIMD_V4F32& abcd )
530 {
531  return vcombine_f32( vget_low_f32( xyzw ) , vget_low_f32( abcd ) );
532 }
533 
534 /// Swaps the two lower floats with each other and the two higher floats with each other.
535 //#define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
536 #define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
537 
538 /// Swap the 2 lower floats with the 2 higher floats.
539 //#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
540 #define AKSIMD_SHUFFLE_CDAB( __a__ ) vcombine_f32( vget_high_f32(__a__), vget_low_f32(__a__) )
541 
542 /// Duplicates the odd items into the even items (d c b a -> d d b b )
543 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
544 
545 /// Duplicates the even items into the odd items (d c b a -> c c a a )
546 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
547 
548 //@}
549 ////////////////////////////////////////////////////////////////////////
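Editorial sketch of the shuffle helpers, not part of the header; the include path and function name are assumptions. AKSIMD_SHUFFLE lists lane indices from highest to lowest, exactly like _MM_SHUFFLE, and each index combination must be implemented in AkSimdShuffle.h or the build fails at link time, as noted above.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path

static void ShuffleExample(AKSIMD_V4F32 in_v) // lanes 0..3 hold a, b, c, d
{
	// Broadcast lane 0 into every lane: (a a a a)
	AKSIMD_V4F32 vSplat = AKSIMD_SHUFFLE_V4F32(in_v, in_v, AKSIMD_SHUFFLE(0, 0, 0, 0));

	// Reverse the lanes: result lane 0 = d, lane 1 = c, lane 2 = b, lane 3 = a
	AKSIMD_V4F32 vReversed = AKSIMD_SHUFFLE_V4F32(in_v, in_v, AKSIMD_SHUFFLE(0, 1, 2, 3));

	// Swap within each pair, equivalent to the vrev64q_f32-based AKSIMD_SHUFFLE_BADC above
	AKSIMD_V4F32 vBadc = AKSIMD_SHUFFLE_BADC(in_v);

	(void)vSplat; (void)vReversed; (void)vBadc;
}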
550 
551 
552 ////////////////////////////////////////////////////////////////////////
553 /// @name AKSIMD arithmetic
554 //@{
555 
556 /// Subtracts the four single-precision, floating-point values of
557 /// a and b (a - b) (see _mm_sub_ps)
558 #define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
559 
560 /// Subtracts the two single-precision, floating-point values of
561 /// a and b
562 #define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
563 
564 /// Subtracts the lower single-precision, floating-point values of a and b.
565 /// The upper three single-precision, floating-point values are passed through from a.
566 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
567 #define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
568  vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) );
569 
570 /// Adds the four single-precision, floating-point values of
571 /// a and b (see _mm_add_ps)
572 #define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
573 
574 /// Adds the two single-precision, floating-point values of
575 /// a and b
576 #define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
577 
578 /// Adds the four integers of a and b
579 #define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
580 
581 /// Multiplies the 4 low parts of both operands into the 4 32-bit integers (no overflow)
582 #define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
583 
584 /// Compares the four single-precision, floating-point values of
585 /// a and b for equality
586 #define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
587 
588 /// Compares the two single-precision, floating-point values of
589 /// a and b for equality
590 #define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
591 
592 /// Adds the lower single-precision, floating-point values of a and b; the
593 /// upper three single-precision, floating-point values are passed through from a.
594 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
595 #define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
596  vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
597 
598 /// Multiplies the four single-precision, floating-point values
599 /// of a and b (see _mm_mul_ps)
600 #define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
601 
602 /// Multiplies the four single-precision, floating-point values of a
603 /// by the single-precision, floating-point scalar b
604 #define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
605 
606 /// Rough estimation of division
607 AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32( AKSIMD_V4F32 a, AKSIMD_V4F32 b )
608 {
609  AKSIMD_V4F32 inv = vrecpeq_f32(b);
610  AKSIMD_V4F32 restep = vrecpsq_f32(b, inv);
611  inv = vmulq_f32(restep, inv);
612  return vmulq_f32(a, inv);
613 }
614 
615 /// Multiplies the two single-precision, floating-point values
616 /// of a and b
617 #define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
618 
619 /// Multiplies the two single-precision, floating-point values of a
620 /// by the single-precision, floating-point scalar b
621 #define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
622 
623 /// Multiplies the lower single-precision, floating-point values of
624 /// a and b; the upper three single-precision, floating-point values
625 /// are passed through from a.
626 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
627 #define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
628  vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
629 
630 /// Vector multiply-add operation.
631 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
632 
633 /// Vector multiply-substract operation. Careful: vmlsq_f32 does c-(a*b) and not the expected (a*b)-c
634 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) \
635  AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
636 
637 
638 #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) \
639  AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
640 
641 #define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) \
642  AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
643 
644 #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
645 #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
646 
647 /// Vector multiply-add operation.
648 AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32( const AKSIMD_V4F32& __a__, const AKSIMD_V4F32& __b__, const AKSIMD_V4F32& __c__ )
649 {
650  return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
651 }
652 
653 /// Computes the minima of the four single-precision, floating-point
654 /// values of a and b (see _mm_min_ps)
655 #define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
656 
657 /// Computes the minima of the two single-precision, floating-point
658 /// values of a and b
659 #define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
660 
661 /// Computes the maximums of the four single-precision, floating-point
662 /// values of a and b (see _mm_max_ps)
663 #define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
664 
665 /// Computes the maximums of the two single-precision, floating-point
666 /// values of a and b
667 #define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
668 
669 /// Returns absolute value
670 #define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
671 
672 /// Changes the sign
673 #define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
674 #define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
675 
676 /// Square root (4 floats)
677 #define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
678 
679 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
680 #define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
681 
682 /// Square root (2 floats)
683 #define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
684 
685 /// Reciprocal of x (1/x)
686 #define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32(__a__)
687 
688 /// Faked in-place vector horizontal add.
689 /// \akwarning
690 /// Don't expect this to be very efficient.
691 /// \endakwarning
692 static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
693 {
694  AKSIMD_V4F32 vAb = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0xB1);
695  AKSIMD_V4F32 vHaddAb = AKSIMD_ADD_V4F32(vVec, vAb);
696  AKSIMD_V4F32 vHaddCd = AKSIMD_SHUFFLE_V4F32(vHaddAb, vHaddAb, 0x4E);
697  AKSIMD_V4F32 vHaddAbcd = AKSIMD_ADD_V4F32(vHaddAb, vHaddCd);
698  return vHaddAbcd;
699 }
700 
701 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
702 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
703 {
704 #ifdef AKSIMD_DECLARE_V4F32
705  static const AKSIMD_DECLARE_V4F32( vSign, -0.f, 0.f, -0.f, 0.f);
706 #else
707  static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
708 #endif
709 
710  float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2); // val[0] will be reals extended, val[1] will be imag
711  vC2Ext.val[1] = AKSIMD_XOR_V4F32(vC2Ext.val[1], vSign);
712  float32x4_t vC1Rev = vrev64q_f32(vCIn1);
713  float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);
714  float32x4_t vFinal = vmlaq_f32(vMul, vC1Rev, vC2Ext.val[1]);
715  return vFinal;
716 }
717 
718 //@}
719 ////////////////////////////////////////////////////////////////////////
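Editorial illustration of the arithmetic helpers, not part of the header; the include path and function names are assumptions. A 4-wide dot product can be built from AKSIMD_MUL_V4F32 and the horizontal add above; AKSIMD_GETELEMENT_V4F32 is assumed to come from the common AkSimd headers, where the _SS macros above also get it.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path

static float Dot4Example(AKSIMD_V4F32 in_vA, AKSIMD_V4F32 in_vB)
{
	// Lane-wise products, then fold; every lane of vSum ends up holding the horizontal sum.
	AKSIMD_V4F32 vProd = AKSIMD_MUL_V4F32(in_vA, in_vB);
	AKSIMD_V4F32 vSum = AKSIMD_HORIZONTALADD_V4F32(vProd);
	return AKSIMD_GETELEMENT_V4F32(vSum, 0);
}

static AKSIMD_V4F32 MaddExample(AKSIMD_V4F32 in_vA, AKSIMD_V4F32 in_vB, AKSIMD_V4F32 in_vAcc)
{
	// Per lane: in_vAcc + in_vA * in_vB (maps to vmlaq_f32 in this implementation)
	return AKSIMD_MADD_V4F32(in_vA, in_vB, in_vAcc);
}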
720 
721 
722 ////////////////////////////////////////////////////////////////////////
723 /// @name AKSIMD packing / unpacking
724 //@{
725 
726 /// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
727 /// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
728 #define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
729 
730 /// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
731 /// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
732 #define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
733 
734 /// Selects and interleaves the lower two single-precision, floating-point
735 /// values from a and b (see _mm_unpacklo_ps)
736 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
737 {
738  // sce_vectormath_xayb(in_vec1, in_vec2)
739  float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
740  float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
741  float32x2x2_t xa_yb = vtrn_f32( xy, ab );
742  AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
743  return xayb;
744 }
745 
746 /// Selects and interleaves the upper two single-precision, floating-point
747 /// values from a and b (see _mm_unpackhi_ps)
748 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
749 {
750  //return sce_vectormath_zcwd( in_vec1, in_vec2 );
751  float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
752  float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
753  float32x2x2_t zc_wd = vtrn_f32( zw, cd );
754  AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
755  return zcwd;
756 }
757 
758 /// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
759 /// integers and saturates (see _mm_packs_epi32)
760 AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
761 {
762  int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
763  int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
764  int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
765  return vreinterpretq_s32_s16( result );
766 }
767 
768 /// V1 = {a,b} => VR = {b,c}
769 /// V2 = {c,d} =>
770 #define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
771 
772 /// V1 = {a,b} => V1 = {a,c}
773 /// V2 = {c,d} => V2 = {b,d}
774 #define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
775 
776 #define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
777 
778 /// V1 = {a,b} => VR = {b,a}
779 #define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
780 
781 // Given four pointers, gathers 32-bits of data from each location,
782 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
783 // e.g. (*addr[0]) := (b a)
784 // e.g. (*addr[1]) := (d c)
785 // e.g. (*addr[2]) := (f e)
786 // e.g. (*addr[3]) := (h g)
787 // return struct has
788 // val[0] := (g e c a)
789 // val[1] := (h f d b)
790 static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
791 {
792  int16x4x2_t ret16{
793  vdup_n_s16(0),
794  vdup_n_s16(0)
795  };
796 
797  ret16 = vld2_lane_s16(addr0, ret16, 0);
798  ret16 = vld2_lane_s16(addr1, ret16, 1);
799  ret16 = vld2_lane_s16(addr2, ret16, 2);
800  ret16 = vld2_lane_s16(addr3, ret16, 3);
801 
802  AKSIMD_V4I32X2 ret{
803  vmovl_s16(ret16.val[0]),
804  vmovl_s16(ret16.val[1])
805  };
806  return ret;
807 }
808 
809 // Given four pointers, gathers 64-bits of data from each location,
810 // deinterleaves them as 16-bits of each, and sign-extends to 32-bits
811 // e.g. (*addr[0]) := (d c b a)
812 // e.g. (*addr[1]) := (h g f e)
813 // e.g. (*addr[2]) := (l k j i)
814 // e.g. (*addr[3]) := (p o n m)
815 // return struct has
816 // val[0] := (m i e a)
817 // val[1] := (n j f b)
818 // val[2] := (o k g c)
819 // val[3] := (p l h d)
820 
821 static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
822 {
823  int16x4x4_t ret16{
824  vdup_n_s16(0),
825  vdup_n_s16(0),
826  vdup_n_s16(0),
827  vdup_n_s16(0)
828  };
829 
830  ret16 = vld4_lane_s16(addr0, ret16, 0);
831  ret16 = vld4_lane_s16(addr1, ret16, 1);
832  ret16 = vld4_lane_s16(addr2, ret16, 2);
833  ret16 = vld4_lane_s16(addr3, ret16, 3);
834 
835  AKSIMD_V4I32X4 ret{
836  vmovl_s16(ret16.val[0]),
837  vmovl_s16(ret16.val[1]),
838  vmovl_s16(ret16.val[2]),
839  vmovl_s16(ret16.val[3])
840  };
841  return ret;
842 }
843 
844 //@}
845 ////////////////////////////////////////////////////////////////////////
846 
847 ////////////////////////////////////////////////////////////////////////
848 /// @name AKSIMD vector comparison
849 /// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
850 //@{
851 
852 #define AKSIMD_CMP_CTRLMASK uint32x4_t
853 
854 /// Compare each float element and return control mask.
855 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
856 
857 /// Compare each float element and return control mask.
858 #define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
859 
860 /// Compare each float element and return control mask.
861 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
862 
863 /// Compare each float element and return control mask.
864 #define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
865 
866 /// Compare each integer element and return control mask.
867 #define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
868 
869 /// Compare each float element and return control mask.
870 #define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
871 
872 /// Compare each integer element and return control mask.
873 #define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
874 
875 /// Returns a where the control mask is 0 and b where the control mask is non-zero; the control mask c is usually produced by the comparison operations above
876 #define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
877 
878 // (cond1 >= cond2) ? b : a.
879 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
880 
881 // a >= 0 ? b : c, following the usual C++ ternary operator reading.
882 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
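Editorial sketch of the comparison and selection helpers, not part of the header; the include path and function name are assumptions. It clamps negative lanes to zero with an explicit mask, which is what AKSIMD_SEL_GTEZ_V4F32 expresses in one line.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path

static AKSIMD_V4F32 ClampNegativeToZeroExample(AKSIMD_V4F32 in_v)
{
	AKSIMD_V4F32 vZero = AKSIMD_SETZERO_V4F32();

	// Mask is all-ones in lanes where in_v >= 0
	AKSIMD_V4COND vGeZero = AKSIMD_GTEQ_V4F32(in_v, vZero);

	// AKSIMD_VSEL_V4F32 returns its second argument where the mask is non-zero,
	// so lanes >= 0 keep in_v and the rest become 0.
	// Equivalent one-liner: AKSIMD_SEL_GTEZ_V4F32(in_v, in_v, vZero)
	return AKSIMD_VSEL_V4F32(vZero, in_v, vGeZero);
}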
883 
884 #define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
885 
886 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 )
887 {
888 #ifdef AKSIMD_DECLARE_V4F32
889  static const AKSIMD_DECLARE_V4I32(movemask, 1, 2, 4, 8);
890  static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
891 #else
892  static const uint32x4_t movemask = { 1, 2, 4, 8 };
893  static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
894 #endif
895 
896  uint32x4_t t0 = in_vec1;
897  uint32x4_t t1 = vtstq_u32(t0, highbit);
898  uint32x4_t t2 = vandq_u32(t1, movemask);
899  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
900  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
901 }
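Editorial sketch of how the movemask emulation above is typically consumed, not part of the header; the include path and function name are assumptions. The returned integer packs one bit per lane, lane 0 in bit 0.

#include <AK/SoundEngine/Common/AkSimd.h> // assumed include path

static bool AnyLaneAboveThresholdExample(AKSIMD_V4F32 in_v, float in_fThreshold)
{
	AKSIMD_V4COND vGt = AKSIMD_GT_V4F32(in_v, AKSIMD_SET_V4F32(in_fThreshold));
	int iMask = AKSIMD_MASK_V4F32(vGt); // bit i is set iff lane i compared true
	return iMask != 0;
}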
902 
903 #ifndef AK_WIN
904 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
905 {
906  return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
907 }
908 #endif
909 
910 // returns true if every element of the provided vector is zero
911 static AkForceInline bool AKSIMD_TESTZERO_V4I32( AKSIMD_V4I32 a )
912 {
913 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
914  uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
915  return maxValue == 0;
916 #else
917  int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
918  return vget_lane_s64(orReduce, 0) == 0;
919 #endif
920 }
921 #define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
922 #define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
923 
924 static AkForceInline bool AKSIMD_TESTONES_V4I32( AKSIMD_V4I32 a )
925 {
926 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
927  uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
928  return minValue == ~0;
929 #else
930  int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
931  return vget_lane_s64(andReduce, 0) == ~0LL;
932 #endif
933 }
934 #define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
935 #define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))
936 
937 
938 static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
939 {
940  int32x4_t temp = AKSIMD_SETV_V4I32(8, 4, 2, 1);
941  int32x4_t xvec = AKSIMD_SET_V4I32((AkInt32)x);
942  int32x4_t xand = AKSIMD_AND_V4I32(xvec, temp);
943  return AKSIMD_EQ_V4I32(temp, xand);
944 }
945 
946 
947 //@}
948 ////////////////////////////////////////////////////////////////////////
949 
950 #endif //_AKSIMD_ARM_NEON_H_
951 