#ifndef _AKSIMD_ARM_NEON_H_
#define _AKSIMD_ARM_NEON_H_

#if defined _MSC_VER && defined _M_ARM64
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

#include <AK/SoundEngine/Common/AkTypes.h>
// _mm_prefetch is an x86 intrinsic and does not exist on ARM targets; use the compiler
// builtin instead (GCC/Clang; an MSVC ARM64 build would need __prefetch).
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) __builtin_prefetch(((char *)(__add__))+(__offset__))
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)

typedef int32x4_t AKSIMD_V4I32;
typedef uint32x4_t AKSIMD_V4UI32;
typedef float32x4_t AKSIMD_V4F32;
typedef float32x2_t AKSIMD_V2F32;
typedef int16x8_t AKSIMD_V8I16;
typedef int16x4_t AKSIMD_V4I16;
typedef uint32x2_t AKSIMD_V2UI32;
typedef int32x2_t AKSIMD_V2I32;
typedef float32_t AKSIMD_F32;

typedef uint32x4_t AKSIMD_V4COND;
typedef uint32x4_t AKSIMD_V4ICOND;
typedef uint32x4_t AKSIMD_V4FCOND;

#if defined(AK_CPU_ARM_NEON)
typedef float32x2x2_t AKSIMD_V2F32X2;
typedef float32x4x2_t AKSIMD_V4F32X2;
typedef float32x4x4_t AKSIMD_V4F32X4;
#define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )

#define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )

#define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )

#define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
#define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )

#define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
#define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )

#define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )

#define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )

#define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))

#define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )

#define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )

#define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )

#define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )

#define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )

#define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )

#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )

#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )

#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )

#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )

#define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )

#define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )

#define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
#define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
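// Illustrative usage sketch (compiled out; the helper below is hypothetical and not part
// of the AKSIMD API): the interleaved X2 load splits L/R sample pairs into val[0]/val[1],
// which together with the plain stores gives a simple stereo de-interleave. Buffers are
// assumed to hold in_uFrames frames, a multiple of 4.
#if 0
static void DeinterleaveStereo( const AkReal32* in_pInterleaved, AkReal32* out_pLeft, AkReal32* out_pRight, AkUInt32 in_uFrames )
{
	for ( AkUInt32 i = 0; i < in_uFrames; i += 4 )
	{
		// vld2q_f32 de-interleaves: val[0] = left samples, val[1] = right samples
		AKSIMD_V4F32X2 vFrames = AKSIMD_LOAD_V4F32X2( in_pInterleaved + 2 * i );
		AKSIMD_STOREU_V4F32( out_pLeft + i, vFrames.val[0] );
		AKSIMD_STOREU_V4F32( out_pRight + i, vFrames.val[1] );
	}
}
#endif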
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )

#define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( __vec__ )

#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )

#define AKSIMD_CONVERT_V2F32_TO_V2I32( __vec__ ) vcvt_s32_f32( __vec__ )
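// Note that vcvtq_s32_f32 rounds toward zero, so on NEON the CONVERT and TRUNCATE macros
// above perform the same operation. Illustrative sketch (compiled out; the helper name is
// hypothetical and not part of the AKSIMD API):
#if 0
static AKSIMD_V4I32 FloatToQ15( AKSIMD_V4F32 in_vSamples )
{
	// Scale [-1, 1] floats to the Q15 range, then truncate each lane to an integer.
	AKSIMD_V4F32 vScaled = vmulq_f32( in_vSamples, AKSIMD_SET_V4F32( 32767.f ) );
	return AKSIMD_TRUNCATE_V4F32_TO_V4I32( vScaled );
}
#endif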
#define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )

#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
	vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )

#define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )

#define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
#define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))

#define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
{
	uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
	uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
	uint32x4_t res = veorq_u32(t0, t1);
	return vreinterpretq_f32_u32(res);
}

#define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	vshlq_n_s32( (__vec__), (__shiftBy__) )

#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	vrshrq_n_s32( (__vec__), (__shiftBy__) )
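// The shift amount must be a compile-time constant: vshlq_n_s32 / vrshrq_n_s32 are
// immediate-form intrinsics, and vrshrq_n_s32 is a *rounding* right shift. Illustrative
// sketch (compiled out; the helper name is hypothetical and not part of the AKSIMD API):
#if 0
static AKSIMD_V4I32 ScaleDownQ8( AKSIMD_V4I32 in_vFixed )
{
	return AKSIMD_SHIFTRIGHTARITH_V4I32( in_vFixed, 8 ); // per lane: (x + 128) >> 8
}
#endif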
#define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )

#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
	(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
	_AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )

#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))

#include <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>
	return vcombine_f32( vget_low_f32( xyzw ) , vget_low_f32( abcd ) );
#define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )

#define AKSIMD_SHUFFLE_CDAB( __a__ ) vcombine_f32( vget_high_f32(__a__), vget_low_f32(__a__) )

#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
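// Illustrative usage sketch (compiled out; the helper name is hypothetical and not part
// of the AKSIMD API): with frames stored as interleaved L/R pairs, AKSIMD_SHUFFLE_BADC
// swaps the two channels of both frames in a vector at once.
#if 0
static AKSIMD_V4F32 SwapStereoChannels( AKSIMD_V4F32 in_vLRLR )
{
	return AKSIMD_SHUFFLE_BADC( in_vLRLR ); // [L0 R0 L1 R1] -> [R0 L0 R1 L1]
}
#endif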
#define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )

#define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
#define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
	vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )

#define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )

#define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )

#define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)

#define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )

#define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )

#define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
	vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )

#define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )

#define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )

	inv = vmulq_f32(restep, inv);
	return vmulq_f32(a, inv);

#define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )

#define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )

#define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
	vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )

#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )

#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) \
	AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )

#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) \
	AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )

#define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) \
	AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )

#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
static AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32( const AKSIMD_V4F32& __a__, const AKSIMD_V4F32& __b__, const AKSIMD_V4F32& __c__ )
{
	return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
}
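// Illustrative usage sketch (compiled out; the helper name is hypothetical and not part
// of the AKSIMD API): mixing a source buffer into a destination at a constant gain with
// the scalar multiply-add macro. Both pointers are assumed 16-byte aligned and
// in_uNumFloats a multiple of 4.
#if 0
static void MixInWithGain( AkReal32* io_pDst, const AkReal32* in_pSrc, AkUInt32 in_uNumFloats, AkReal32 in_fGain )
{
	for ( AkUInt32 i = 0; i < in_uNumFloats; i += 4 )
	{
		AKSIMD_V4F32 vDst = AKSIMD_LOAD_V4F32( io_pDst + i );
		AKSIMD_V4F32 vSrc = AKSIMD_LOAD_V4F32( in_pSrc + i );
		// vDst += vSrc * in_fGain (vmlaq_n_f32 under the hood)
		vDst = AKSIMD_MADD_V4F32_SCALAR( vSrc, in_fGain, vDst );
		AKSIMD_STORE_V4F32( io_pDst + i, vDst );
	}
}
#endif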
#define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )

#define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )

#define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )

#define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )

#define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))

#define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
#define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )

#define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )

#define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )

#define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
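// The SQRT/RSQRT macros above rely on the NEON estimate instructions, which give roughly
// 8 bits of precision. Where more accuracy is needed, one Newton-Raphson step with
// vrsqrtsq_f32 can refine the estimate; an illustrative sketch (compiled out; the helper
// name is hypothetical and not part of the AKSIMD API):
#if 0
static AKSIMD_V4F32 RSqrtRefined( AKSIMD_V4F32 in_vec )
{
	AKSIMD_V4F32 vEst = vrsqrteq_f32( in_vec );
	// vrsqrtsq_f32( x, e*e ) returns (3 - x*e*e) / 2, the Newton-Raphson correction factor
	vEst = vmulq_f32( vEst, vrsqrtsq_f32( in_vec, vmulq_f32( vEst, vEst ) ) );
	return vEst;
}
#endif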
static AkForceInline void AKSIMD_HORIZONTALADD( AKSIMD_V4F32 & vVec )
{
	AKSIMD_V4F32 vHighLow = AKSIMD_MOVEHL_V4F32(vVec, vVec);
	vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
	vHighLow = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0x55);
	vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);	// lane 0 now holds the sum of all four input lanes
}
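// Illustrative usage sketch (compiled out; the helper name is hypothetical and not part
// of the AKSIMD API): summing a buffer, then pulling the total out of lane 0 with
// AKSIMD_GETELEMENT_V4F32 (defined in the common AkSimd headers). The buffer is assumed
// 16-byte aligned with a length that is a multiple of 4.
#if 0
static AkReal32 SumBuffer( const AkReal32* in_pData, AkUInt32 in_uNumFloats )
{
	AKSIMD_V4F32 vSum = AKSIMD_SETZERO_V4F32();
	for ( AkUInt32 i = 0; i < in_uNumFloats; i += 4 )
		vSum = AKSIMD_ADD_V4F32( vSum, AKSIMD_LOAD_V4F32( in_pData + i ) );
	AKSIMD_HORIZONTALADD( vSum );
	return AKSIMD_GETELEMENT_V4F32( vSum, 0 );
}
#endif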
	vTmpa1 = vrev64_f32( vTmpa1 );
	vTmpa1 = vmul_f32( vTmpa1, vSign );
	vTmpa0 = vadd_f32( vTmpa0, vTmpa1 );

	vTmpb1 = vrev64_f32( vTmpb1 );
	vTmpb1 = vmul_f32( vTmpb1, vSign );
	vTmpb0 = vadd_f32( vTmpb0, vTmpb1 );

	return vcombine_f32( vTmpa0, vTmpb0 );
#ifdef AKSIMD_DECLARE_V4F32
	static const AKSIMD_DECLARE_V4F32( vSign, 1.f, -1.f, 1.f, -1.f );
#else
	static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f };
#endif

	AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
	vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
	AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
	vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
	vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
	vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 );
	vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
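	// The block above is the core of the interleaved complex multiply: with
	// vCIn1 = [a0 b0 a1 b1] and vCIn2 = [c0 d0 c1 d1] stored as re/im pairs,
	// vTmp1 = [a0*c0, a0*d0, a1*c1, a1*d1] and, after the sign flip and the BADC swap,
	// vTmp2 = [-b0*d0, b0*c0, -b1*d1, b1*c1], so vTmp2 + vTmp1 is
	// [a0*c0 - b0*d0, a0*d0 + b0*c0, ...], i.e. (a + bi)(c + di) for each pair.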
#define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )

#define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
	float32x2_t xy = vget_low_f32( in_vec1 );
	float32x2_t ab = vget_low_f32( in_vec2 );
	float32x2x2_t xa_yb = vtrn_f32( xy, ab );
	AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );

	float32x2_t zw = vget_high_f32( in_vec1 );
	float32x2_t cd = vget_high_f32( in_vec2 );
	float32x2x2_t zc_wd = vtrn_f32( zw, cd );
	AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
	int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
	int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
	int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
	return vreinterpretq_s32_s16( result );
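	// vqmovn_s32 narrows each 32-bit lane to 16 bits with saturation, so values outside
	// [-32768, 32767] are clamped rather than wrapped - the behavior normally wanted when
	// packing mixed 32-bit samples down to 16-bit PCM.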
#define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )

#define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )

#define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )

#define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
#define AKSIMD_CMP_CTRLMASK uint32x4_t

#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))

#define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))

#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))

#define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))

#define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))

#define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))

#define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))

#define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )

#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )

#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )

#define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
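// Illustrative usage sketch (compiled out; the helper name is hypothetical and not part
// of the AKSIMD API): AKSIMD_VSEL_V4F32( a, b, c ) returns b in the lanes where the
// condition mask c is set and a elsewhere, which gives branch-free per-lane selection.
#if 0
static AKSIMD_V4F32 ClampAbove( AKSIMD_V4F32 in_vec, AKSIMD_V4F32 in_vMax )
{
	// Where in_vec > in_vMax, take in_vMax; otherwise keep in_vec.
	return AKSIMD_VSEL_V4F32( in_vec, in_vMax, AKSIMD_GT_V4F32( in_vec, in_vMax ) );
}
#endif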
static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 )
{
#ifdef AKSIMD_DECLARE_V4F32
	static const AKSIMD_DECLARE_V4I32(movemask, 1, 2, 4, 8);
	static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
#else
	static const uint32x4_t movemask = { 1, 2, 4, 8 };
	static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
#endif

	uint32x4_t t0 = in_vec1;
	uint32x4_t t1 = vtstq_u32(t0, highbit);
	uint32x4_t t2 = vandq_u32(t1, movemask);
	uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
	return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
{
	return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
}
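// Illustrative usage sketch (compiled out; the helper name is hypothetical and not part
// of the AKSIMD API): AKSIMD_MASK_V4F32 packs the condition bit of each lane into bits
// 0..3 of the result, which makes "any lane" / "all lanes" tests cheap scalar checks.
#if 0
static bool AnyAboveThreshold( AKSIMD_V4F32 in_vec, AkReal32 in_fThreshold )
{
	AKSIMD_V4COND vCond = AKSIMD_GT_V4F32( in_vec, AKSIMD_SET_V4F32( in_fThreshold ) );
	return AKSIMD_MASK_V4F32( vCond ) != 0;	// a result of 0xF would mean all four lanes are above
}
#endif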
#endif //_AKSIMD_ARM_NEON_H_