#if defined _MSC_VER && defined _M_ARM64
#include <arm64_neon.h>
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
#define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
#define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
	float32x4_t ret = vdupq_n_f32(0);
	ret = vsetq_lane_f32(d, ret, 3);
	ret = vsetq_lane_f32(c, ret, 2);
	ret = vsetq_lane_f32(b, ret, 1);
	ret = vsetq_lane_f32(a, ret, 0);
#define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
#define AKSIMD_SET_V16I8( __scalar__ ) vdupq_n_s8( __scalar__ )
	int32x4_t ret = vdupq_n_s32(0);
	ret = vsetq_lane_s32(d, ret, 3);
	ret = vsetq_lane_s32(c, ret, 2);
	ret = vsetq_lane_s32(b, ret, 1);
	ret = vsetq_lane_s32(a, ret, 0);
#if defined AK_CPU_ARM_64
	int64x2_t ret = vdupq_n_s64(0);
	ret = vsetq_lane_s64(b, ret, 1);
	ret = vsetq_lane_s64(a, ret, 0);
	return vreinterpretq_s32_s64(ret);
	int32x4_t ret = vdupq_n_s32(0);
	ret = vsetq_lane_s32(int32_t((b >> 32) & 0xFFFFFFFF), ret, 3);
	ret = vsetq_lane_s32(int32_t((b >> 0) & 0xFFFFFFFF), ret, 2);
	ret = vsetq_lane_s32(int32_t((a >> 32) & 0xFFFFFFFF), ret, 1);
	ret = vsetq_lane_s32(int32_t((a >> 0) & 0xFFFFFFFF), ret, 0);
#if defined AK_CPU_ARM_64
	float64x2_t ret = (float64x2_t)vdupq_n_s64(0);
	ret = vsetq_lane_f64(b, ret, 1);
	ret = vsetq_lane_f64(a, ret, 0);
	return (float32x4_t)(ret);
	int64_t a64 = *(int64_t*)&a;
	int64_t b64 = *(int64_t*)&b;
	int32x4_t ret = vdupq_n_s32(0);
	ret = vsetq_lane_s32(int32_t((b64 >> 32) & 0xFFFFFFFF), ret, 3);
	ret = vsetq_lane_s32(int32_t((b64 >> 0) & 0xFFFFFFFF), ret, 2);
	ret = vsetq_lane_s32(int32_t((a64 >> 32) & 0xFFFFFFFF), ret, 1);
	ret = vsetq_lane_s32(int32_t((a64 >> 0) & 0xFFFFFFFF), ret, 0);
	return vreinterpretq_f32_s32(ret);
#define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
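// --- Editor's illustrative sketch (not part of the original SDK header) ---
// Typical use of the load/set helpers above. AKSIMD_V4F32 and AkReal32 are the vector and
// scalar types used elsewhere in this header; the function name and the assumption that
// pInput is 16-byte aligned (see AKSIMD_ALIGNSIZE) are illustrative only.
static inline AKSIMD_V4F32 AKSIMD_ExampleLoadAndScale( const AkReal32* pInput, AkReal32 fGain )
{
	AKSIMD_V4F32 vIn   = AKSIMD_LOAD_V4F32( pInput );   // 4 contiguous floats
	AKSIMD_V4F32 vGain = AKSIMD_SET_V4F32( fGain );     // broadcast the scalar to all lanes
	return vmulq_f32( vIn, vGain );                     // AKSIMD_MUL_V4F32 is defined further down
}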
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
#define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
#define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
#define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
#define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
#define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
#define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
#define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
#define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
#define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
#define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
#define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
#define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) vst1q_lane_f64( (float64_t*)(__addr__), vreinterpretq_f64_f32(__vec__), 0 )
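// --- Editor's illustrative sketch (not part of the original SDK header) ---
// Demonstrates the interleaved load above: the vld2q-based AKSIMD_LOAD_V4F32X2 splits four
// interleaved stereo frames (L R L R ...) into one vector per channel. float32x4x2_t is the
// NEON type returned by vld2q_f32; the function name is illustrative only.
static inline void AKSIMD_ExampleDeinterleaveStereo( const AkReal32* in_pInterleaved, AkReal32* out_pLeft, AkReal32* out_pRight )
{
	float32x4x2_t channels = AKSIMD_LOAD_V4F32X2( in_pInterleaved ); // val[0] = left samples, val[1] = right samples
	AKSIMD_STORE_V4F32( out_pLeft,  channels.val[0] );
	AKSIMD_STORE_V4F32( out_pRight, channels.val[1] );
}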
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
#if defined AK_CPU_ARM_64
	return vcvtaq_s32_f32(a);
	float32x4_t halfPos = vdupq_n_f32(0.5f);
	float32x4_t halfNeg = vdupq_n_f32(-0.5f);
	float32x4_t zero = vdupq_n_f32(0.0f);
	const uint32x4_t signMask = vcgtq_f32(a, zero);
	const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg);
	const float32x4_t aOffset = vaddq_f32(a, signedHalf);
	return vcvtq_s32_f32(aOffset);
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
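// --- Editor's note (illustration only) ---
// Contrasts the rounding conversion above (vcvtaq_s32_f32 / add-half fallback, ties away
// from zero) with the truncating AKSIMD_TRUNCATE_V4F32_TO_V4I32. The rounding helper is
// assumed to be exposed as AKSIMD_ROUND_V4F32_TO_V4I32 (its declaration is outside this excerpt).
//   input lanes : {  1.5f, -1.5f,  2.7f, -2.7f }
//   rounded     : {  2,    -2,     3,    -3    }
//   truncated   : {  1,    -1,     2,    -2    }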
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
#if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
	__asm__(
		"fcvtl %0.4s, %1.4h" \
#elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
	float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
	float32x4_t ret = vcvt_f32_f16(vecf16);
	uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16);
	ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData);
#elif defined(AK_CPU_ARM_64)
	return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
	// Scalar-NEON fallback: expand each 16-bit half-float into a 32-bit float by hand.
	// Place the half-float bit pattern in the top 16 bits of each 32-bit lane.
	uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16);
	// Isolate exponent+mantissa and shift them into the float32 field positions.
	uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000));
	uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3);
	// Denormal inputs take a different exponent rebias, plus a post-add correction.
	uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
	uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
	uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
	// Rebias the exponent, then subtract the denormal adjustment in float space.
	uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
	uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
	// Inputs with an all-ones half exponent (Inf/NaN) get the float32 Inf/NaN exponent forced in.
	uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
	uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
	uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
	// Reattach the sign bit and reinterpret the result as float32x4.
	uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
	float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
	return assembledFloat;
#if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
	__asm__(
		"fcvtn %1.4h, %1.4s\n" \
		"\tmov %0.8b, %1.8b" \
#elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
	float16x4_t ret = vcvt_f16_f32(vec);
	uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
	ret = vorr_u16(vreinterpret_s16_f16(ret), signData);
	return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
#elif defined(AK_CPU_ARM_64)
	return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
	// Scalar-NEON fallback: pack each float32 lane into a 16-bit half-float by hand.
	// Split off the sign and work with the absolute value.
	uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
	uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
	// Denormal path: adding 0.5f drops the tiny value into the mantissa of 0.5, which leaves
	// the half-float denormal pattern in the low bits; shift it into the top half of the lane.
	float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
	uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
	// Normal path: rebias/round with a magic constant, adjust for round-to-nearest-even using
	// the mantissa LSB, then shift the half pattern into the top 16 bits.
	uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF);
	uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
	uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
	uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31);
	uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb);
	uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
	// Pick the denormal or normal encoding per lane, splitting at the smallest normal half-float.
	uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23);
	uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
	uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
	// Values too large for half precision, Inf, and NaN map to the half Inf/NaN encoding;
	// NaNs additionally keep a mantissa bit so they stay NaN.
	uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000));
	uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
	uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec));
	uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000));
	uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000));
	uint32x4_t infNanData = vorrq_u32(infData, nantissaBit);
	uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData);
	// Reattach the sign, then gather the half results (held in the top 16 bits of each lane)
	// into the low 64 bits of the return value.
	uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
	uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
	return vreinterpretq_s32_u16(resultZip.val[1]);
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
#define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) (AKSIMD_V4COND)(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) (AKSIMD_V4COND)(__vec__)
#define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
	vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
#define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
#define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
#define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
#define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32(__a__,__b__)
#define AKSIMD_NOT_V4I32( __a__ ) veorq_s32(__a__, vdupq_n_s32(~0u))
#define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
	uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
	uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
	uint32x4_t res = veorq_u32(t0, t1);
	return vreinterpretq_f32_u32(res);
	uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
	uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
	uint32x4_t res = vorrq_u32(t0, t1);
	return vreinterpretq_f32_u32(res);
	uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
	uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
	uint32x4_t res = vandq_u32(t0, t1);
	return vreinterpretq_f32_u32(res);
	uint32x4_t allSet = vdupq_n_u32(~0u);
	uint32x4_t reinterpret = vreinterpretq_u32_f32(in_vec);
	uint32x4_t result = veorq_u32(reinterpret, allSet);
	return vreinterpretq_f32_u32(result);
#define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__)
#define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__)
#define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	vshlq_n_s32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
	vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	vshrq_n_s32( (__vec__), (__shiftBy__) )
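// --- Editor's note (illustration only) ---
// AKSIMD_SHIFTRIGHT_V4I32 shifts in zero bits (logical shift), while AKSIMD_SHIFTRIGHTARITH_V4I32
// preserves the sign. For a lane holding -8 (0xFFFFFFF8), a shift by 1 yields 0x7FFFFFFC with the
// logical form and -4 (0xFFFFFFFC) with the arithmetic form.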
#define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
	(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
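// --- Editor's note (illustration only) ---
// AKSIMD_SHUFFLE uses the same 8-bit encoding as SSE's _MM_SHUFFLE: AKSIMD_SHUFFLE(3,2,1,0)
// is 0xE4, the identity selector. Per the __builtin_shufflevector expansion below,
// AKSIMD_SHUFFLE_V4F32( a, b, AKSIMD_SHUFFLE(fp3,fp2,fp1,fp0) ) produces
// { a[fp0], a[fp1], b[fp2], b[fp3] }: the low two lanes come from a, the high two from b.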
#if defined(__clang__)
#if defined(__has_builtin) && __has_builtin(__builtin_shufflevector)
#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
	__builtin_shufflevector(a, b, (zyxw >> 0) & 0x3, (zyxw >> 2) & 0x3, ((zyxw >> 4) & 0x3) + 4, ((zyxw >> 6) & 0x3) + 4 )
#ifndef AKSIMD_SHUFFLE_V4F32
#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
	_AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
#define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
#define AKSIMD_MOVEHL_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__b__, __a__, AKSIMD_SHUFFLE(3,2,3,2))
#define AKSIMD_MOVELH_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__a__, __b__, AKSIMD_SHUFFLE(1,0,1,0))
#define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
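// --- Editor's note (illustration only) ---
// With __a__ = { A, B, C, D } (lanes 0..3), the named shuffles above yield:
//   AKSIMD_SHUFFLE_BADC -> { B, A, D, C }     AKSIMD_SHUFFLE_CDAB -> { C, D, A, B }
//   AKSIMD_SHUFFLE_BCDA -> { B, C, D, A }     AKSIMD_DUP_ODD      -> { B, B, D, D }
//   AKSIMD_DUP_EVEN     -> { A, A, C, C }
//   AKSIMD_MOVELH_V4F32(a,b) -> { a0, a1, b0, b1 }   AKSIMD_MOVEHL_V4F32(a,b) -> { b2, b3, a2, a3 }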
#define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
#define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
#define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
	vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
#define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
#define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
#define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
#define AKSIMD_MULLO_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
#define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
#define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
#define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
	vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
#define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
	inv = vmulq_f32(restep, inv);
	return vmulq_f32(a, inv);
#define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
#define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
#define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
	vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#if defined(AK_CPU_ARM_64)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vfmaq_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) vfma_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vfmaq_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vfma_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
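// --- Editor's illustrative sketch (not part of the original SDK header) ---
// Typical multiply-accumulate use of the MADD macros above: io_pDst += in_pSrc * in_fGain for
// four samples. Note the operand order: AKSIMD_MADD_V4F32( a, b, c ) computes a*b + c.
// The function name is illustrative only.
static inline void AKSIMD_ExampleMixInGain( AkReal32* io_pDst, const AkReal32* in_pSrc, AkReal32 in_fGain )
{
	AKSIMD_V4F32 vDst = AKSIMD_LOAD_V4F32( io_pDst );
	AKSIMD_V4F32 vSrc = AKSIMD_LOAD_V4F32( in_pSrc );
	AKSIMD_STORE_V4F32( io_pDst, AKSIMD_MADD_V4F32_SCALAR( vSrc, in_fGain, vDst ) );
}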
#define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
#define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
#define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
#define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
#define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
#define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
#define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
#define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
#define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
#define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
#define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32(__a__)
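// --- Editor's illustrative sketch (not part of the original SDK header) ---
// AKSIMD_RECIP_V4F32 returns only the raw vrecpeq_f32 estimate. One vrecpsq_f32 step, as the
// 'restep' term in the division helper above suggests, refines it considerably; the function
// name is illustrative only.
static inline AKSIMD_V4F32 AKSIMD_ExampleRefinedRecip( AKSIMD_V4F32 a )
{
	AKSIMD_V4F32 inv = vrecpeq_f32( a );            // low-precision initial estimate
	inv = vmulq_f32( vrecpsq_f32( a, inv ), inv );  // one Newton-Raphson refinement step
	return inv;
}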
	float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2);
	float32x4_t vC1Rev = vrev64q_f32(vCIn1);
	float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);
	return vaddq_f32(vIn1, vIn2SignFlip);
#define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
#define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
	float32x2_t xy = vget_low_f32( in_vec1 );
	float32x2_t ab = vget_low_f32( in_vec2 );
	float32x2x2_t xa_yb = vtrn_f32( xy, ab );
	AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
	float32x2_t zw = vget_high_f32( in_vec1 );
	float32x2_t cd = vget_high_f32( in_vec2 );
	float32x2x2_t zc_wd = vtrn_f32( zw, cd );
	AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
	int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
	int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
	int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
	return vreinterpretq_s32_s16( result );
#define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
#define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
#define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
#define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
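// --- Editor's note (illustration only) ---
// With in_vec1 = { a, b } and in_vec2 = { c, d }: AKSIMD_HILO_V2F32 -> { b, c },
// AKSIMD_SWAP_V2F32( in_vec1 ) -> { b, a }, and AKSIMD_TRANSPOSE_V2F32 returns the pair
// { a, c } / { b, d } in val[0] / val[1].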
	ret16 = vld2_lane_s16(addr0, ret16, 0);
	ret16 = vld2_lane_s16(addr1, ret16, 1);
	ret16 = vld2_lane_s16(addr2, ret16, 2);
	ret16 = vld2_lane_s16(addr3, ret16, 3);
		vmovl_s16(ret16.val[0]),
		vmovl_s16(ret16.val[1])
	ret16 = vld4_lane_s16(addr0, ret16, 0);
	ret16 = vld4_lane_s16(addr1, ret16, 1);
	ret16 = vld4_lane_s16(addr2, ret16, 2);
	ret16 = vld4_lane_s16(addr3, ret16, 3);
		vmovl_s16(ret16.val[0]),
		vmovl_s16(ret16.val[1]),
		vmovl_s16(ret16.val[2]),
		vmovl_s16(ret16.val[3])
#define AKSIMD_CMP_CTRLMASK uint32x4_t
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
#define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
#define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
#define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
#define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
#define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
#define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
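// --- Editor's note (illustration only) ---
// AKSIMD_VSEL_V4F32( a, b, cond ) returns b in lanes where cond is all-ones and a elsewhere.
// For example, a branchless clamp of negative lanes to zero can be written as
// AKSIMD_SEL_GTEZ_V4F32( vIn, vIn, AKSIMD_SETZERO_V4F32() ).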
#define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
	static const uint32x4_t movemask = vreinterpretq_u32_s32(
		AKSIMD_SETV_V4I32( 8, 4, 2, 1 ));
	static const uint32x4_t highbit = vreinterpretq_u32_s32(
		AKSIMD_SETV_V4I32( 0x80000000, 0x80000000, 0x80000000, 0x80000000 ));
	uint32x4_t t0 = in_vec1;
	uint32x4_t t1 = vtstq_u32(t0, highbit);
	uint32x4_t t2 = vandq_u32(t1, movemask);
	uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
	return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
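// --- Editor's note (illustration only) ---
// The sequence above collapses the sign bit of each lane into bits 0..3 of the scalar return
// value (bit i set when lane i's top bit is set), i.e. the NEON counterpart of SSE's
// _mm_movemask_ps.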
#if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
	uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
	return maxValue == 0;
	int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
	return vget_lane_s64(orReduce, 0) == 0;
#define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
#if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
	uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
	return minValue == ~0;
	int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
	return vget_lane_s64(andReduce, 0) == ~0LL;
#define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
#define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))
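// --- Editor's illustrative sketch (not part of the original SDK header) ---
// Branching on a whole-vector comparison with the test helpers above. It is assumed the
// TESTONES/TESTZERO helpers return nonzero exactly when all lanes / no lanes are set;
// the function name is illustrative only.
static inline bool AKSIMD_ExampleAllLanesEqual( AKSIMD_V4F32 a, AKSIMD_V4F32 b )
{
	return AKSIMD_TESTONES_V4COND( AKSIMD_EQ_V4F32( a, b ) ) != 0;
}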