32 #ifndef _AK_SIMD_AVX_H_ 
   33 #define _AK_SIMD_AVX_H_ 
   38 #if defined(AKSIMD_AVX_SUPPORTED) 
   40 #include <immintrin.h> 
   47 typedef __m256 AKSIMD_V8F32;    
 
   48 typedef __m256d AKSIMD_V4F64;   
 
   49 typedef __m256i AKSIMD_V8I32;   
 
   50 typedef AKSIMD_V8F32 AKSIMD_V8COND;  
 
   51 typedef AKSIMD_V8F32 AKSIMD_V8FCOND;     
 
   52 typedef AKSIMD_V8I32 AKSIMD_V8ICOND;
 
   65 #define AKSIMD_LOAD_V8F32( __addr__ ) _mm256_loadu_ps( (AkReal32*)(__addr__) ) 
   69 #define AKSIMD_LOAD1_V8F32( __scalar__ ) _mm256_broadcast_ss( &(__scalar__) ) 
   73 #define AKSIMD_LOAD1_V4F64( __scalar__ ) _mm256_castpd_ps(_mm256_broadcast_sd( &(__scalar__) )) 
   77 #define AKSIMD_SET_V8F32( __scalar__ ) _mm256_set1_ps( (__scalar__) ) 
   80 #define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) ) 
   83 #define AKSIMD_SETV_V4F64( _d, _c, _b, _a )  _mm256_castpd_ps( _mm256_set_pd( (_d), (_c), (_b), (_a) ) ) 
   87 #define AKSIMD_SETZERO_V8F32() _mm256_setzero_ps() 
   92 #define AKSIMD_LOAD_SS_V8F32( __addr__ ) _mm256_zextps128_ps256(_mm_load_ss( (__addr__) )) 
   98 #define AKSIMD_SETV_V2F128( m2, m1) _mm256_set_m128(m2, m1) 
  100 #define AKSIMD_INSERT_V2F128( a, m128, idx) _mm256_insertf128_ps(a, m128, idx) 
  102 #define AKSIMD_GETELEMENT_V8F32( __vName, __num__ )         ((AkReal32*)&(__vName))[(__num__)] 
  103 #define AKSIMD_GETELEMENT_V4F64( __vName, __num__ )         ((AkReal64*)&(__vName))[(__num__)] 
  104 #define AKSIMD_GETELEMENT_V8I32( __vName, __num__ )         ((AkInt32*)&(__vName))[(__num__)] 
  105 #define AKSIMD_GETELEMENT_V4I64( __vName, __num__ )         ((AkInt64*)&(__vName))[(__num__)] 
  118 #define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) ) 
  122 #define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) ) 
  136 #define AKSIMD_SHUFFLE_V8F32( a, b, i ) _mm256_shuffle_ps( a, b, i ) 
  139 #define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1)) 
  142 #define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2)) 
  145 #define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1)) 
  148 #define AKSIMD_DUP_V8_ODD(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1)) 
  151 #define AKSIMD_DUP_V8_EVEN(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0)) 
  154 #define AKSIMD_SHUFFLE_V8I32( a, b, i ) _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), i ))  
  157 #define AKSIMD_PERMUTEVAR_V8F32(a, b) _mm256_permutevar_ps(a, b) 
  160 #define AKSIMD_PERMUTE128( l1, l0 ) (((l1) << 4) | (l0)) 
  164 #define AKSIMD_PERMUTE_2X128_V8F32( a, b, i ) _mm256_permute2f128_ps(a, b, i) 
  167 #define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(2, 0)) 
  170 #define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(3, 1)) 
  173 #define AKSIMD_EXTRACT_V2F128( a, i ) _mm256_extractf128_ps(a, i) 
// NOTE(review): the body of this 8x4 transpose appears truncated in this copy —
// the braces and the unpack/shuffle sequence that should follow the temporaries
// are missing. Restore the full implementation from the original source before
// building; only the signature and the temporary declarations survive here.
  180 static AkForceInline void AKSIMD_TRANSPOSE8X4_V8F32(AKSIMD_V8F32& A, AKSIMD_V8F32& B, AKSIMD_V8F32& C, AKSIMD_V8F32& D)

  182     AKSIMD_V8F32 tmp1, tmp2, tmp3, tmp4;
 
  204 #define AKSIMD_SUB_V8F32( a, b ) _mm256_sub_ps( a, b ) 
  209 #define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( a, _mm256_and_ps(b, _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) 
  213 #define AKSIMD_ADD_V8F32( a, b ) _mm256_add_ps( a, b ) 
  217 #define AKSIMD_ADDSUB_V8F32( a, b ) _mm256_addsub_ps( a, b ) 
  222 #define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( a, _mm256_and_ps(b, _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) 
  226 #define AKSIMD_MUL_V8F32( a, b ) _mm256_mul_ps( a, b ) 
  228 #define AKSIMD_DIV_V8F32( a, b ) _mm256_div_ps( a, b ) 
  234 #define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps(b, _mm256_set1_ps(1.0f), 0xfe ) ) 
  238 #define AKSIMD_MIN_V8F32( a, b ) _mm256_min_ps( a, b ) 
  242 #define AKSIMD_MAX_V8F32( a, b ) _mm256_max_ps( a, b ) 
  245 #define AKSIMD_ABS_V8F32( a ) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a) 
  248 #define AKSIMD_NEG_V8F32( __a__ ) _mm256_xor_ps(_mm256_set1_ps(-0.f), __a__) 
  251 #define AKSIMD_SQRT_V8F32( __a__ ) _mm256_sqrt_ps( (__a__) ) 
  254 #define AKSIMD_RSQRT_V8F32( __a__ ) _mm256_rsqrt_ps( (__a__) ) 
  257 #define AKSIMD_RECIP_V8F32( __a__ ) _mm256_rcp_ps( (__a__) ) 
  260 #define AKSIMD_CEIL_V8F32( __a__ ) _mm256_ceil_ps( (__a__) ) 
  262 #define AKSIMD_XOR_V8F32( a, b ) _mm256_xor_ps(a,b) 
  263 #define AKSIMD_OR_V8F32( a, b ) _mm256_or_ps(a,b) 
  264 #define AKSIMD_AND_V8F32( a, b) _mm256_and_ps(a,b) 
  265 #define AKSIMD_NOT_V8F32( a ) _mm256_xor_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(~0))) 
  266 #define AKSIMD_ANDNOT_V8F32( a, b ) _mm256_andnot_ps(a, b) 
  272 static AkForceInline AKSIMD_V8F32 AKSIMD_HORIZONTALADD_V8F32(AKSIMD_V8F32 vVec)
 
  274     __m256 vAb = _mm256_shuffle_ps(vVec, vVec, 0xB1);
 
  275     __m256 vHaddAb = _mm256_add_ps(vVec, vAb);
 
  276     __m256 vHaddCd = _mm256_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
 
  277     __m256 vHaddAbcd = _mm256_add_ps(vHaddAb, vHaddCd);
 
  278     __m256 vHaddEfgh = _mm256_permute2f128_ps(vHaddAbcd, vHaddAbcd, 0x01);
 
  279     __m256 vHaddAll = _mm256_add_ps(vHaddAbcd, vHaddEfgh);
 
  284 static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(
const AKSIMD_V8F32 cIn1, 
const AKSIMD_V8F32 cIn2)
 
  286     __m256 real1Ext = _mm256_moveldup_ps(cIn1);             
 
  287     __m256 in2Shuf = _mm256_shuffle_ps(cIn2, cIn2, 0xB1);       
 
  288     __m256 imag1Ext = _mm256_movehdup_ps(cIn1);             
 
  289     __m256 temp = _mm256_mul_ps(imag1Ext, in2Shuf);             
 
  290     __m256 mul = _mm256_mul_ps(real1Ext, cIn2); 
 
  291     __m256 out = _mm256_addsub_ps(mul, temp); 
 
  306 #define AKSIMD_UNPACKLO_V8F32( a, b ) _mm256_unpacklo_ps( a, b ) 
  311 #define AKSIMD_UNPACKHI_V8F32( a, b ) _mm256_unpackhi_ps( a, b ) 
  320 #define AKSIMD_CMP_CTRLMASKV8 __m256 
  323 #define AKSIMD_LTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LE_OS ) 
  325 #define AKSIMD_LT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LT_OS ) 
  328 #define AKSIMD_GTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GE_OS ) 
  330 #define AKSIMD_GT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GT_OS ) 
  333 #define AKSIMD_EQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_EQ_OS ) 
  336 static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
 
  338     return _mm256_blendv_ps(vA, vB, vMask);
 
  342 #define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) ) 
  345 #define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( __a__, _mm256_set1_ps(0) ) ) 
  347 #define AKSIMD_SPLAT_V8F32(var, idx) AKSIMD_SHUFFLE_V8F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx)) 
  349 #define AKSIMD_MASK_V8F32( __a__ ) _mm256_movemask_ps( __a__ ) 
  352 #define AKSIMD_TESTZERO_V8I32( __a__ ) (_mm256_testz_si256(__a__,__a__) != 0) 
  353 #define AKSIMD_TESTZERO_V8F32( __a__) AKSIMD_TESTZERO_V8I32(_mm256_castps_si256(__a__)) 
  356 #define AKSIMD_TESTONES_V8I32(__a__) (_mm256_testc_si256(__a__, _mm256_set1_epi32(~0)) != 0) 
  357 #define AKSIMD_TESTONES_V8F32( __a__) AKSIMD_TESTONES_V8I32(_mm256_castps_si256(__a__)) 
  364 #define AKSIMD_LOAD_V8I32( __addr__ ) _mm256_loadu_si256( (__addr__) ) 
  367 #define AKSIMD_SETZERO_V8I32() _mm256_setzero_si256() 
  370 #define AKSIMD_SET_V8I32( __scalar__ ) _mm256_set1_epi32( (__scalar__) ) 
  373 #define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) ) 
  379 #define AKSIMD_SET_V2I128(m1, m2) _mm256_setr_m128i(m1, m2) 
  384 #define AKSIMD_STORE_V8I32( __addr__, __vec__ ) _mm256_storeu_si256( (__addr__), (__vec__) ) 
  392 #define AKSIMD_CONVERT_V8I32_TO_V8F32( __vec__ ) _mm256_cvtepi32_ps( (__vec__) ) 
  396 #define AKSIMD_ROUND_V8F32_TO_V8I32( __vec__ ) _mm256_cvtps_epi32( (__vec__) ) 
  400 #define AKSIMD_TRUNCATE_V8F32_TO_V8I32( __vec__ ) _mm256_cvttps_epi32( (__vec__) ) 
  405 #define AKSIMD_CONVERT_V8F16_TO_V8F32( __vec__ ) _mm256_cvtph_ps( (__vec__) ) 
  410 #define AKSIMD_CONVERT_V8F32_TO_V8F16( __vec__ ) _mm256_cvtps_ph(__vec__, (_MM_FROUND_TO_NEAREST_INT ) ) 
  421 #define AKSIMD_CAST_V4F64_TO_V8F32( __vec__ ) _mm256_castpd_ps(__vec__) 
  425 #define AKSIMD_CAST_V4F64_TO_V8I32( __vec__ ) _mm256_castpd_si256(__vec__) 
  429 #define AKSIMD_CAST_V8F32_TO_V4F64( __vec__ ) _mm256_castps_pd(__vec__) 
  433 #define AKSIMD_CAST_V8F32_TO_V8I32( __vec__ ) _mm256_castps_si256(__vec__) 
  437 #define AKSIMD_CAST_V8I32_TO_V4F64( __vec__ ) _mm256_castsi256_pd(__vec__) 
  441 #define AKSIMD_CAST_V8I32_TO_V8F32( __vec__ ) _mm256_castsi256_ps(__vec__) 
  444 #define AKSIMD_CAST_V8COND_TO_V8F32( __vec__ ) (__vec__) 
  447 #define AKSIMD_CAST_V8F32_TO_V8COND( __vec__ ) (__vec__) 
  450 #define AKSIMD_CAST_V8COND_TO_V8I32( __vec__ )  _mm256_castps_si256(__vec__) 
  453 #define AKSIMD_CAST_V8I32_TO_V8COND( __vec__ ) _mm256_castsi256_ps(__vec__) 
  457 #endif //_AK_SIMD_AVX_H_