目次

include/AK/SoundEngine/Platforms/arm_neon/AkSimd.h

説明を見る。
00001 
00002 //
00003 // Copyright (c) 2006 Audiokinetic Inc. / All Rights Reserved
00004 //
00006 
00007 // AkSimd.h
00008 
00011 
00012 #ifndef _AKSIMD_ARM_NEON_H_
00013 #define _AKSIMD_ARM_NEON_H_
00014 
00015 #include <arm_neon.h>
00016 #include <AK/SoundEngine/Common/AkTypes.h>
00017 
00018 // Platform specific defines for prefetching
00019 
00020 /*
00021 // Cache line size
00022 #define AKSIMD_ARCHCACHELINESIZE    (64)                ///< Assumed cache line width for architectures on this platform
00023 // Maximum prefetch size
00024 #define AKSIMD_ARCHMAXPREFETCHSIZE  (512)               ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)       
00026 // Memory prefetch (disabled: _mm_prefetch is an x86 SSE intrinsic, not a NEON one)
00027 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA ) 
00028 */
00029 
00032 
00033 
/// Rounds a size up to the next multiple of 16 (the byte width of one 128-bit
/// vector). A size that is already a multiple of 16 is returned unchanged.
#define AKSIMD_ALIGNSIZE( size ) ( ( (size) + 15 ) & ~15 )
00035 
00037 
00038 
00041 
00042 
00043 typedef int32x4_t       AKSIMD_V4I32;       
00044 typedef int16x8_t       AKSIMD_V8I16;       
00045 typedef int16x4_t       AKSIMD_V4I16;       
00046 typedef uint32x4_t      AKSIMD_V4UI32;      
00047 typedef uint32x2_t      AKSIMD_V2UI32;      
00048 typedef int32x2_t       AKSIMD_V2I32;       
00049 typedef float32_t       AKSIMD_F32;         
00050 typedef float32x2_t     AKSIMD_V2F32;       
00051 typedef float32x4_t     AKSIMD_V4F32;       
00052 
00053 typedef uint32x4_t      AKSIMD_V4COND;      
00054 typedef uint32x4_t      AKSIMD_V4ICOND;     
00055 typedef uint32x4_t      AKSIMD_V4FCOND;     
00056 
00057 #if defined(AK_CPU_ARM_NEON)
00058 typedef float32x2x2_t   AKSIMD_V2F32X2;
00059 typedef float32x4x2_t   AKSIMD_V4F32X2;
00060 typedef float32x4x4_t   AKSIMD_V4F32X4;
00061 #endif
00062 
00063 #define AKSIMD_V4F32_SUPPORTED
00064 
00065 
00067 
00068 
00071 
00072 
/// Loads 4 single-precision floats from memory into a vector.
#define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (const float32_t*)(__addr__) )

/// Unaligned-load variant. On NEON it maps to the same vld1q_f32 intrinsic as
/// AKSIMD_LOAD_V4F32.
#define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (const float32_t*)(__addr__) )

/// Loads one float and replicates it into all 4 lanes. The macro takes the
/// address of __scalar__, so the argument must be an lvalue.
#define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (const float32_t*)(&(__scalar__)) )

/// Returns a vector whose 4 float lanes all hold __scalar__.
#define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )

/// Returns a vector whose 4 signed 32-bit lanes all hold __scalar__.
#define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )

/// Returns a float vector with all 4 lanes set to zero.
#define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )

/// Loads one float from memory into lane 0; lanes 1-3 are zero.
/// The expansion is a plain expression (no trailing semicolon), so the macro
/// can be used as a function argument or inside a larger expression.
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (const float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )

/// Loads 4 signed 32-bit integers from memory.
#define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )

/// Loads 8 signed 16-bit integers from memory.
#define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )

/// Loads 4 signed 16-bit integers from memory into a 64-bit vector.
#define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
00108 
/// Loads 4 signed 32-bit integers from memory (unaligned variant).
#if defined(AK_VITA)
// Due to a compiler bug in Sony SDK 1.8 and 2.0, this workaround is required.
// Remove it when the bug is fixed.
// The argument and the whole expansion are parenthesized so that expressions
// such as AKSIMD_LOADU_V4I32( p + 1 ) dereference the intended address.
#define AKSIMD_LOADU_V4I32( __addr__ ) ( *(__addr__) )
#else
#define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
#endif
00116 
/// Returns an integer vector with all 4 lanes set to zero.
#define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )

/// Loads 2 single-precision floats from memory into a 64-bit vector.
#define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (const float32_t*)(__addr__) )

/// Loads one float from memory into lane __lane__ of __vec__ and returns the
/// updated vector. The expansion has no trailing semicolon so it can be used
/// as an expression.
#define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (const float32_t*)(__addr__), (__vec__), (__lane__) )

/// Returns a 64-bit vector whose 2 float lanes both hold __scalar__.
#define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )

/// Returns a 64-bit float vector with both lanes set to zero.
#define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )

/// De-interleaving loads: consecutive floats are distributed alternately
/// between the two result vectors (vld2q_f32 / vld2_f32).
#define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (const float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (const float32_t*)(__addr__) )

/// Single-lane de-interleaving loads: one element per vector of __vec__ is
/// loaded into lane __lane__. The expansions have no trailing semicolon so
/// they can be used as expressions.
#define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (const float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (const float32_t*)(__addr__), (__vec__), (__lane__) )
00136 
00138 
00139 
00140 
00143 
00144 
/// Writes the 4 floats of a vector to memory.
#define AKSIMD_STORE_V4F32( dst, vec ) \
    vst1q_f32( (float32_t*)(dst), (vec) )

/// Unaligned-store variant; same vst1q_f32 intrinsic as AKSIMD_STORE_V4F32.
#define AKSIMD_STOREU_V4F32( dst, vec ) \
    vst1q_f32( (float32_t*)(dst), (vec) )

/// Writes only lane 0 of the vector (a single float) to memory.
#define AKSIMD_STORE1_V4F32( dst, vec ) \
    vst1q_lane_f32( (float32_t*)(dst), (vec), 0 )

/// Writes the 4 signed 32-bit integers of a vector to memory.
#define AKSIMD_STORE_V4I32( dst, vec ) \
    vst1q_s32( (int32_t*)(dst), (vec) )

/// Unaligned-store variant; same vst1q_s32 intrinsic as AKSIMD_STORE_V4I32.
#define AKSIMD_STOREU_V4I32( dst, vec ) \
    vst1q_s32( (int32_t*)(dst), (vec) )

/// Writes the 4 unsigned 32-bit integers of a vector to memory.
#define AKSIMD_STOREU_V4UI32( dst, vec ) \
    vst1q_u32( (uint32_t*)(dst), (vec) )

/// Writes the 2 floats of a 64-bit vector to memory.
#define AKSIMD_STORE_V2F32( dst, vec ) \
    vst1_f32( (AkReal32*)(dst), (vec) )

/// Interleaving stores: elements of the two vectors in the aggregate are
/// written alternately to memory (vst2q_f32 / vst2_f32).
#define AKSIMD_STORE_V4F32X2( dst, vec ) \
    vst2q_f32( (float32_t*)(dst), (vec) )
#define AKSIMD_STORE_V2F32X2( dst, vec ) \
    vst2_f32( (float32_t*)(dst), (vec) )
00170 
00172 
00173 
00174 
00177 
00178 
/// Converts 4 signed 32-bit integers to 4 single-precision floats.
#define AKSIMD_CONVERT_V4I32_TO_V4F32( vec ) \
    vcvtq_f32_s32( (vec) )

/// Converts 4 single-precision floats to 4 signed 32-bit integers.
/// NOTE: vcvtq_s32_f32 rounds toward zero, so on this platform this macro
/// behaves exactly like AKSIMD_TRUNCATE_V4F32_TO_V4I32.
#define AKSIMD_CONVERT_V4F32_TO_V4I32( vec ) \
    vcvtq_s32_f32( (vec) )

/// Converts 4 single-precision floats to 4 signed 32-bit integers, discarding
/// the fractional part.
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( vec ) \
    vcvtq_s32_f32( (vec) )

/// Converts 2 single-precision floats to 2 signed 32-bit integers
/// (rounds toward zero, like the 4-lane version).
#define AKSIMD_CONVERT_V2F32_TO_V2I32( vec ) \
    vcvt_s32_f32( (vec) )
00194 
00196 
00197 
00198 
00201 
00202 
/// Bitwise AND of two vectors of 4 signed 32-bit integers.
#define AKSIMD_AND_V4I32( lhs, rhs ) \
    vandq_s32( (lhs), (rhs) )

/// Reinterprets both 4 x 32-bit operands as 8 x 16-bit signed integers and
/// compares them lane by lane (lhs > rhs). Each 16-bit lane of the result is
/// all ones where the comparison holds and zero otherwise; the result is
/// returned reinterpreted as 4 x 32-bit signed integers.
#define AKSIMD_CMPGT_V8I16( lhs, rhs ) \
    vreinterpretq_s32_u16( \
        vcgtq_s16( vreinterpretq_s16_s32( (lhs) ), vreinterpretq_s16_s32( (rhs) ) ) )

/// Lane-wise float comparison (lhs <= rhs) producing a mask vector.
#define AKSIMD_CMPLE_V4F32( lhs, rhs ) \
    vcleq_f32( (lhs), (rhs) )
00214 
00216 
00217 
00218 
00221 
00222 
/// Shifts each of the 4 signed 32-bit integers left by __shiftBy__ bits,
/// shifting in zeros. __shiftBy__ must be a compile-time constant.
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
    vshlq_n_s32( (__vec__), (__shiftBy__) )

/// Shifts each of the 4 signed 32-bit integers right by __shiftBy__ bits,
/// shifting in the sign bit. __shiftBy__ must be a compile-time constant.
/// Uses vshrq_n_s32 (plain arithmetic shift) rather than vrshrq_n_s32: the
/// latter adds a rounding constant before shifting, which is not what an
/// arithmetic shift right does (e.g. 3 >> 1 must yield 1, not 2).
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
    vshrq_n_s32( (__vec__), (__shiftBy__) )
00232 
00234 
00235 
00236 
00239 
00240 
/// Joins two 2-element float vectors into one 4-element vector; the first
/// argument supplies the low half and the second the high half.
#define AKSIMD_COMBINE_V2F32( lowHalf, highHalf ) \
    vcombine_f32( lowHalf, highHalf )

/// Builds the 8-bit selector passed to AKSIMD_SHUFFLE_V4F32() by packing four
/// 2-bit element indices, sel3 in the top bits down to sel0 in the bottom bits
/// (same encoding as _MM_SHUFFLE).
#define AKSIMD_SHUFFLE( sel3, sel2, sel1, sel0 ) \
    ( ( (sel3) << 6 ) | ( (sel2) << 4 ) | ( (sel1) << 2 ) | ( (sel0) ) )

/// Shuffles elements of two vectors according to a compile-time selector.
/// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
/// Each selector value needs its own _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >
/// specialization; a link error usually means the requested one is missing
/// from <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
#define AKSIMD_SHUFFLE_V4F32( vecA, vecB, selector ) \
    _AKSIMD_LOCAL::SHUFFLE_V4F32< selector >( vecA, vecB )
00257 
00258 // Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are
00259 // implemented in a separate header file to keep this one cleaner:
00260 #include <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>
00261 
00267 inline AKSIMD_V4F32 AKSIMD_MOVEHL_V4F32( const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw ) 
00268 {
00269         //return akshuffle_zwcd( xyzw, abcd );
00270         AKSIMD_V2F32 zw = vget_high_f32( xyzw );
00271         AKSIMD_V2F32 cd = vget_high_f32( abcd );
00272         AKSIMD_V4F32 zwcd = vcombine_f32( zw , cd );
00273         return zwcd;
00274 }
00275 
00281 inline AKSIMD_V4F32 AKSIMD_MOVELH_V4F32( const AKSIMD_V4F32& xyzw, const AKSIMD_V4F32& abcd )
00282 {
00283     return vcombine_f32( vget_low_f32( xyzw ) , vget_low_f32( abcd ) );
00284 }
00285 
/// Swaps the two elements inside each 64-bit half: {a,b,c,d} -> {b,a,d,c}.
/// Same result as shuffling with AKSIMD_SHUFFLE(2,3,0,1), done with a single
/// vrev64q_f32.
#define AKSIMD_SHUFFLE_BADC( vec ) \
    vrev64q_f32( vec )

/// Swaps the low and high 64-bit halves: {a,b,c,d} -> {c,d,a,b}.
/// Same result as shuffling with AKSIMD_SHUFFLE(1,0,3,2).
/// NOTE: the argument is expanded twice, so it should be free of side effects.
#define AKSIMD_SHUFFLE_CDAB( vec ) \
    vcombine_f32( vget_high_f32( vec ), vget_low_f32( vec ) )

/// Shuffles a vector with itself using selector (3,3,1,1), i.e. duplicates
/// the odd-indexed elements.
#define AKSIMD_DUP_ODD( vec ) \
    AKSIMD_SHUFFLE_V4F32( vec, vec, AKSIMD_SHUFFLE( 3, 3, 1, 1 ) )

/// Shuffles a vector with itself using selector (2,2,0,0), i.e. duplicates
/// the even-indexed elements.
#define AKSIMD_DUP_EVEN( vec ) \
    AKSIMD_SHUFFLE_V4F32( vec, vec, AKSIMD_SHUFFLE( 2, 2, 0, 0 ) )
00299 
00301 
00302 
00303 
00306 
00307 
/// Lane-wise subtraction of 4 floats: lhs - rhs.
#define AKSIMD_SUB_V4F32( lhs, rhs ) \
    vsubq_f32( (lhs), (rhs) )

/// Lane-wise subtraction of 2 floats: lhs - rhs.
#define AKSIMD_SUB_V2F32( lhs, rhs ) \
    vsub_f32( (lhs), (rhs) )
00315 
/// Subtracts element 0 of __b__ from lane 0 of __a__; lanes 1-3 of __a__ are
/// returned unchanged (zero is subtracted from them).
/// The expansion has no trailing semicolon, matching AKSIMD_ADD_SS_V4F32 and
/// AKSIMD_MUL_SS_V4F32, so it can be used inside a larger expression.
#define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
    vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
00321 
/// Lane-wise addition of 4 floats.
#define AKSIMD_ADD_V4F32( lhs, rhs ) \
    vaddq_f32( (lhs), (rhs) )

/// Lane-wise addition of 2 floats.
#define AKSIMD_ADD_V2F32( lhs, rhs ) \
    vadd_f32( (lhs), (rhs) )

/// Lane-wise addition of 4 signed 32-bit integers.
#define AKSIMD_ADD_V4I32( lhs, rhs ) \
    vaddq_s32( (lhs), (rhs) )

/// Lane-wise equality comparison of 4 floats, producing a mask vector.
#define AKSIMD_COMP_V4F32( lhs, rhs ) \
    vceqq_f32( (lhs), (rhs) )

/// Lane-wise equality comparison of 2 floats, producing a mask vector.
#define AKSIMD_COMP_V2F32( lhs, rhs ) \
    vceq_f32( (lhs), (rhs) )

/// Adds element 0 of rhs to lane 0 of lhs; lanes 1-3 of lhs are returned
/// unchanged (zero is added to them).
#define AKSIMD_ADD_SS_V4F32( lhs, rhs ) \
    vaddq_f32( (lhs), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (rhs), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )

/// Lane-wise multiplication of 4 floats.
#define AKSIMD_MUL_V4F32( lhs, rhs ) \
    vmulq_f32( (lhs), (rhs) )

/// Multiplies each of the 4 floats of vec by the scalar factor.
#define AKSIMD_MUL_V4F32_SCALAR( vec, factor ) \
    vmulq_n_f32( (vec), (factor) )
00354 
00356 AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32( AKSIMD_V4F32 a, AKSIMD_V4F32 b ) 
00357 {
00358     AKSIMD_V4F32 inv = vrecpeq_f32(b);
00359     AKSIMD_V4F32 restep = vrecpsq_f32(b, inv);
00360     inv = vmulq_f32(restep, inv);
00361     return vmulq_f32(a, inv);
00362 }
00363 
/// Lane-wise multiplication of 2 floats.
#define AKSIMD_MUL_V2F32( lhs, rhs ) \
    vmul_f32( (lhs), (rhs) )

/// Multiplies each of the 2 floats of vec by the scalar factor.
#define AKSIMD_MUL_V2F32_SCALAR( vec, factor ) \
    vmul_n_f32( (vec), (factor) )
00371 
/// Multiplies lane 0 of __a__ by element 0 of __b__; lanes 1-3 of __a__ are
/// passed through unchanged, like AKSIMD_ADD_SS_V4F32 / AKSIMD_SUB_SS_V4F32.
/// The multiplier vector is {b0, 1, 1, 1}: multiplying the upper lanes by 1
/// preserves them, whereas a zero-filled multiplier would wipe them out.
#define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
    vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SET_V4F32( 1.0f ), 0 ) )
00378 
/// Multiply-add on 4 floats: (x * y) + z, built from the separate multiply
/// and add macros.
#define AKSIMD_MADD_V4F32( x, y, z ) \
    AKSIMD_ADD_V4F32( AKSIMD_MUL_V4F32( (x), (y) ), (z) )

/// Multiply-subtract on 4 floats: (x * y) - z.
#define AKSIMD_MSUB_V4F32( x, y, z ) \
    AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (x), (y) ), (z) )

/// Multiply-add on 2 floats: (x * y) + z.
#define AKSIMD_MADD_V2F32( x, y, z ) \
    AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (x), (y) ), (z) )

/// Multiply-subtract on 2 floats: (x * y) - z.
#define AKSIMD_MSUB_V2F32( x, y, z ) \
    AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (x), (y) ), (z) )

/// Multiply-add using the dedicated multiply-accumulate intrinsics:
/// z + (x * y).
#define AKSIMD_MADD_V4F32_INST( x, y, z ) \
    vmlaq_f32( (z), (x), (y) )
#define AKSIMD_MADD_V2F32_INST( x, y, z ) \
    vmla_f32( (z), (x), (y) )

// There is no MSUB counterpart built on vmlsq_f32 / vmls_f32: those
// intrinsics compute z - (x * y), the opposite sign of (x * y) - z.

/// Multiply-add with a scalar multiplier: z + (x * s), where s is a scalar.
#define AKSIMD_MADD_V4F32_SCALAR( x, s, z ) \
    vmlaq_n_f32( (z), (x), (s) )
#define AKSIMD_MADD_V2F32_SCALAR( x, s, z ) \
    vmla_n_f32( (z), (x), (s) )
00399 
00401 AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32( const AKSIMD_V4F32& __a__, const AKSIMD_V4F32& __b__, const AKSIMD_V4F32& __c__ )
00402 {
00403     return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
00404 }
00405 
/// Lane-wise minimum of 4 floats.
#define AKSIMD_MIN_V4F32( lhs, rhs ) \
    vminq_f32( (lhs), (rhs) )

/// Lane-wise minimum of 2 floats.
#define AKSIMD_MIN_V2F32( lhs, rhs ) \
    vmin_f32( (lhs), (rhs) )

/// Lane-wise maximum of 4 floats.
#define AKSIMD_MAX_V4F32( lhs, rhs ) \
    vmaxq_f32( (lhs), (rhs) )

/// Lane-wise maximum of 2 floats.
#define AKSIMD_MAX_V2F32( lhs, rhs ) \
    vmax_f32( (lhs), (rhs) )

/// Lane-wise absolute value of 4 floats.
#define AKSIMD_ABS_V4F32( vec ) \
    vabsq_f32( (vec) )

/// Lane-wise negation of 2 / 4 floats.
#define AKSIMD_NEG_V2F32( vec ) \
    vneg_f32( (vec) )
#define AKSIMD_NEG_V4F32( vec ) \
    vnegq_f32( (vec) )

/// Approximate lane-wise square root of 4 floats, computed as the reciprocal
/// estimate of the reciprocal-square-root estimate. Both steps are hardware
/// estimates with no refinement, so precision is limited.
#define AKSIMD_SQRT_V4F32( vec ) \
    vrecpeq_f32( vrsqrteq_f32( vec ) )

/// Approximate lane-wise square root of 2 floats (same estimate-of-estimate
/// scheme as the 4-lane version).
#define AKSIMD_SQRT_V2F32( vec ) \
    vrecpe_f32( vrsqrte_f32( vec ) )
00434 
00439 static AkForceInline void AKSIMD_HORIZONTALADD( AKSIMD_V4F32 & vVec )
00440 {   
00441     AKSIMD_V4F32 vHighLow = AKSIMD_MOVEHL_V4F32(vVec, vVec);
00442     vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
00443     vHighLow = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0x55);
00444     vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
00445 } 
00446 
00448 
00449 #if defined(AK_IOS) || defined(AK_VITA)
00450 
00451 // V2 implementation (faster 'cause ARM processors actually have an x2 pipeline)
00452 
00453 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
00454 {
00455     static const AKSIMD_V2F32 vSign = { -1.f, 1.f }; 
00456 
00457     AKSIMD_V2F32 vCIn1a = vget_low_f32( vCIn1 );
00458     AKSIMD_V2F32 vCIn2a = vget_low_f32( vCIn2 );
00459     AKSIMD_V2F32 vTmpa0 = vmul_n_f32( vCIn2a, vCIn1a[0] );
00460     AKSIMD_V2F32 vTmpa1 = vmul_n_f32( vCIn2a, vCIn1a[1] );
00461     vTmpa1 = vrev64_f32( vTmpa1 );
00462     vTmpa1 = vmul_f32( vTmpa1, vSign );
00463     vTmpa0 = vadd_f32( vTmpa0, vTmpa1 );
00464 
00465     AKSIMD_V2F32 vCIn1b = vget_high_f32( vCIn1 );
00466     AKSIMD_V2F32 vCIn2b = vget_high_f32( vCIn2 );
00467     AKSIMD_V2F32 vTmpb0 = vmul_n_f32( vCIn2b, vCIn1b[0] );
00468     AKSIMD_V2F32 vTmpb1 = vmul_n_f32( vCIn2b, vCIn1b[1] );
00469     vTmpb1 = vrev64_f32( vTmpb1 );
00470     vTmpb1 = vmul_f32( vTmpb1, vSign );
00471     vTmpb0 = vadd_f32( vTmpb0, vTmpb1 );
00472 
00473     return vcombine_f32( vTmpa0, vTmpb0 );
00474 }
00475 
00476 #else
00477 
00478 // V4 implementation (kept in case future ARM processors actually have an x4 pipeline)
00479 
00480 static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
00481 {
00482 #ifdef AKSIMD_DECLARE_V4F32
00483     static const AKSIMD_DECLARE_V4F32( vSign, 1.f, -1.f, 1.f, -1.f ); 
00484 #else
00485     static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f }; 
00486 #endif
00487 
00488     AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0)); 
00489     vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
00490     AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1)); 
00491     vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
00492     vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
00493     vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 ); 
00494     vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
00495     return vTmp2;
00496 }
00497 
00498 #endif
00499 
00501 
00502 
00503 
00506 
00507 
/// Treats both 4 x 32-bit operands as 8 x 16-bit integers and interleaves
/// their lower halves (first result of vzipq_s16); the result is returned
/// reinterpreted as 4 x 32-bit integers.
#define AKSIMD_UNPACKLO_VECTOR8I16( first, second ) \
    vreinterpretq_s32_s16( \
        vzipq_s16( vreinterpretq_s16_s32( first ), vreinterpretq_s16_s32( second ) ).val[0] )

/// Same as AKSIMD_UNPACKLO_VECTOR8I16 but interleaves the upper halves
/// (second result of vzipq_s16).
#define AKSIMD_UNPACKHI_VECTOR8I16( first, second ) \
    vreinterpretq_s32_s16( \
        vzipq_s16( vreinterpretq_s16_s32( first ), vreinterpretq_s16_s32( second ) ).val[1] )
00515 
00518 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00519 {
00520     // sce_vectormath_xayb(in_vec1, in_vec2)
00521     float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
00522     float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
00523     float32x2x2_t xa_yb = vtrn_f32( xy, ab );
00524     AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
00525     return xayb;
00526 }
00527 
00530 AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
00531 {
00532     //return sce_vectormath_zcwd( in_vec1, in_vec2 );
00533     float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
00534     float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
00535     float32x2x2_t zc_wd = vtrn_f32( zw, cd );
00536     AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
00537     return zcwd;
00538 }
00539 
00542 AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
00543 {
00544     int16x4_t   vec1_16 = vqmovn_s32( in_vec1 );
00545     int16x4_t   vec2_16 = vqmovn_s32( in_vec2 );
00546     int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
00547     return vreinterpretq_s32_s16( result );
00548 }
00549 
/// Builds a 2-float vector from the high element of the first argument and
/// the low element of the second: {first[1], second[0]}.
#define AKSIMD_HILO_V2F32( first, second ) \
    vreinterpret_f32_u32( \
        vext_u32( vreinterpret_u32_f32( first ), vreinterpret_u32_f32( second ), 1 ) )

/// Transposes two 2-float vectors, returning both results as a pair
/// (see vtrn_f32).
#define AKSIMD_TRANSPOSE_V2F32( first, second ) \
    vtrn_f32( first, second )

/// Transposes two 4-float vectors, returning both results as a pair
/// (see vtrnq_f32).
#define AKSIMD_TRANSPOSE_V4F32( first, second ) \
    vtrnq_f32( first, second )

/// Swaps the two elements of a 2-float vector: {a,b} -> {b,a}.
#define AKSIMD_SWAP_V2F32( vec ) \
    vrev64_f32( vec )
00562 
00564 
00565 
00569 
00570 
/// Type of the mask produced by the comparison macros below.
#define AKSIMD_CMP_CTRLMASK uint32x4_t

/// Lane-wise comparison lhs >= rhs on 4 floats, producing a mask.
#define AKSIMD_GTEQ_V4F32( lhs, rhs ) \
    vcgeq_f32( (lhs), (rhs) )

/// Lane-wise comparison lhs >= rhs on 4 signed 32-bit integers, producing a mask.
#define AKSIMD_GTEQ_V4I32( lhs, rhs ) \
    vcgeq_s32( (lhs), (rhs) )

/// Lane-wise comparison lhs == rhs on 4 floats, producing a mask.
#define AKSIMD_EQ_V4F32( lhs, rhs ) \
    vceqq_f32( (lhs), (rhs) )

/// Lane-wise comparison lhs == rhs on 4 signed 32-bit integers, producing a mask.
#define AKSIMD_EQ_V4I32( lhs, rhs ) \
    vceqq_s32( (lhs), (rhs) )

/// Bitwise select: where a mask bit is set the result takes the bit from
/// ifSet, otherwise from ifClear. Note the argument order
/// (ifClear, ifSet, mask).
#define AKSIMD_VSEL_V4F32( ifClear, ifSet, mask ) \
    vbslq_f32( (mask), (ifSet), (ifClear) )

/// (cond1 >= cond2) ? ifTrue : ifFalse, evaluated per lane.
#define AKSIMD_SEL_GTEQ_V4F32( ifFalse, ifTrue, cond1, cond2 ) \
    AKSIMD_VSEL_V4F32( ifFalse, ifTrue, AKSIMD_GTEQ_V4F32( cond1, cond2 ) )

/// (test >= 0) ? ifTrue : ifFalse, evaluated per lane; arguments follow the
/// order of the C++ conditional operator.
#define AKSIMD_SEL_GTEZ_V4F32( test, ifTrue, ifFalse ) \
    AKSIMD_VSEL_V4F32( (ifFalse), (ifTrue), AKSIMD_GTEQ_V4F32( test, AKSIMD_SETZERO_V4F32() ) )

/// Returns a vector whose 4 lanes all hold element idx of vec; idx must be a
/// compile-time constant.
#define AKSIMD_SPLAT_V4F32( vec, idx ) \
    vmovq_n_f32( vgetq_lane_f32( vec, idx ) )
00595 
00597 
00598 
00599 #endif //_AKSIMD_ARM_NEON_H_
00600