include/AK/SoundEngine/Platforms/arm_neon/AkSimd.h

/*******************************************************************************
The content of this file includes portions of the AUDIOKINETIC Wwise Technology
released in source code form as part of the SDK installer package.

Commercial License Usage

Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
may use this file in accordance with the end user license agreement provided
with the software or, alternatively, in accordance with the terms contained in a
written agreement between you and Audiokinetic Inc.

Apache License Usage

Alternatively, this file may be used under the Apache License, Version 2.0 (the
"Apache License"); you may not use this file except in compliance with the
Apache License. You may obtain a copy of the Apache License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing, software distributed
under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
the specific language governing permissions and limitations under the License.

  Version: <VERSION>  Build: <BUILDNUMBER>
  Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
*******************************************************************************/

// AkSimd.h

/// \file
/// AKSIMD - arm_neon implementation

#ifndef _AKSIMD_ARM_NEON_H_
#define _AKSIMD_ARM_NEON_H_

#if defined _MSC_VER && defined _M_ARM64
    #include <arm64_neon.h>
#else
    #include <arm_neon.h>
#endif
#include <AK/SoundEngine/Common/AkTypes.h>

// Platform specific defines for prefetching

/*
// ??????
#define AKSIMD_ARCHCACHELINESIZE    (64)                ///< Assumed cache line width for architectures on this platform
// ??????
#define AKSIMD_ARCHMAXPREFETCHSIZE  (512)               ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)
/// Cross-platform memory prefetch of effective address assuming non-temporal data
// ??????
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )
*/

////////////////////////////////////////////////////////////////////////
/// @name Platform specific memory size alignment for allocation purposes
//@{

#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
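
/// Example (illustrative sketch, not part of the SDK API): rounding an
/// allocation size up to the 16-byte multiple expected by aligned SIMD access.
/// \code
/// AkUInt32 uRawSize     = 100;
/// AkUInt32 uAlignedSize = AKSIMD_ALIGNSIZE( uRawSize ); // 112, the next multiple of 16
/// \endcode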

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD types
//@{

typedef int32x4_t       AKSIMD_V4I32;       ///< Vector of 4 32-bit signed integers
typedef int16x8_t       AKSIMD_V8I16;       ///< Vector of 8 16-bit signed integers
typedef int16x4_t       AKSIMD_V4I16;       ///< Vector of 4 16-bit signed integers
typedef uint32x4_t      AKSIMD_V4UI32;      ///< Vector of 4 32-bit unsigned integers
typedef uint32x2_t      AKSIMD_V2UI32;      ///< Vector of 2 32-bit unsigned integers
typedef int32x2_t       AKSIMD_V2I32;       ///< Vector of 2 32-bit signed integers
typedef float32_t       AKSIMD_F32;         ///< 32-bit float
typedef float32x2_t     AKSIMD_V2F32;       ///< Vector of 2 32-bit floats
typedef float32x4_t     AKSIMD_V4F32;       ///< Vector of 4 32-bit floats

typedef uint32x4_t      AKSIMD_V4COND;      ///< Vector of 4 comparison results
typedef uint32x4_t      AKSIMD_V4ICOND;     ///< Vector of 4 comparison results
typedef uint32x4_t      AKSIMD_V4FCOND;     ///< Vector of 4 comparison results

#if defined(AK_CPU_ARM_NEON)
typedef float32x2x2_t   AKSIMD_V2F32X2;
typedef float32x4x2_t   AKSIMD_V4F32X2;
typedef float32x4x4_t   AKSIMD_V4F32X4;
#endif

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD loading / setting
//@{

/// Loads four single-precision, floating-point values (see _mm_load_ps)
#define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )

/// Loads four single-precision floating-point values from unaligned
/// memory (see _mm_loadu_ps)
#define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )

/// Loads a single single-precision, floating-point value, copying it into
/// all four words (see _mm_load1_ps, _mm_load_ps1)
#define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )

/// Sets the four single-precision, floating-point values to __scalar__ (see
/// _mm_set1_ps, _mm_set_ps1)
#define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )

/// Sets the four integer values to __scalar__
#define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )

/// Sets the four single-precision, floating-point values to zero (see
/// _mm_setzero_ps)
#define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )

/// Loads a single-precision, floating-point value into the low word
/// and clears the upper three words.
/// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )

/// Loads four 32-bit signed integer values (aligned)
#define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )

/// Loads 8 16-bit signed integer values (aligned)
#define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )

/// Loads 4 16-bit signed integer values (aligned)
#define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )

/// Loads unaligned 128-bit value (see _mm_loadu_si128)
#define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
/// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
#define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )

/// Loads two single-precision, floating-point values
#define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )

/// Sets the two single-precision, floating-point values to __scalar__
#define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )

/// Sets the two single-precision, floating-point values to zero
#define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )

/// Loads data from memory and de-interleaves
#define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )

/// Loads data from memory and de-interleaves; only selected lane
#define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
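
/// Example (illustrative sketch, not part of the SDK API): typical use of the
/// load/set macros above; the buffer name is hypothetical and assumed suitably
/// aligned for the aligned-load variant.
/// \code
/// AkReal32 pfBuf[4] = { 1.f, 2.f, 3.f, 4.f };
/// AKSIMD_V4F32 vData = AKSIMD_LOAD_V4F32( pfBuf );   // { 1.f, 2.f, 3.f, 4.f }
/// AKSIMD_V4F32 vGain = AKSIMD_SET_V4F32( 0.5f );     // { 0.5f, 0.5f, 0.5f, 0.5f }
/// AKSIMD_V4F32 vZero = AKSIMD_SETZERO_V4F32();       // { 0.f, 0.f, 0.f, 0.f }
/// \endcode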

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD storing
//@{

/// Stores four single-precision, floating-point values. The address must be 16-byte aligned
#define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )

/// Stores four single-precision, floating-point values. The address does not need to be 16-byte aligned.
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )

/// Stores the lower single-precision, floating-point value.
/// *p := a0 (see _mm_store_ss)
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )

/// Stores four 32-bit integer values. The address must be 16-byte aligned.
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )

/// Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )

/// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
#define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )

/// Stores two single-precision, floating-point values. The address must be 16-byte aligned.
#define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (float32_t*)(__addr__), (__vName__) )

/// Stores data by interleaving into memory
#define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
#define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
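
/// Example (illustrative sketch, not part of the SDK API): applying a constant
/// gain to a sample buffer with the load/store macros; the function name is
/// hypothetical, and io_pfSamples is assumed 16-byte aligned with a length
/// that is a multiple of 4.
/// \code
/// void ApplyGain( AkReal32* io_pfSamples, AkUInt32 in_uNumSamples, AkReal32 in_fGain )
/// {
///     AKSIMD_V4F32 vGain = AKSIMD_SET_V4F32( in_fGain );
///     for ( AkUInt32 i = 0; i < in_uNumSamples; i += 4 )
///     {
///         AKSIMD_V4F32 vSamples = AKSIMD_LOAD_V4F32( io_pfSamples + i );
///         AKSIMD_STORE_V4F32( io_pfSamples + i, AKSIMD_MUL_V4F32( vSamples, vGain ) );
///     }
/// }
/// \endcode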

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD conversion
//@{

/// Converts the four signed 32-bit integer values of a to single-precision,
/// floating-point values (see _mm_cvtepi32_ps)
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values (see _mm_cvtps_epi32). Careful: unlike _mm_cvtps_epi32,
/// which rounds to nearest, vcvtq_s32_f32 truncates toward zero.
#define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( __vec__ )

/// Converts the four single-precision, floating-point values of a to signed
/// 32-bit integer values by truncating (see _mm_cvttps_epi32)
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )

/// Converts the two single-precision, floating-point values of a to signed
/// 32-bit integer values
#define AKSIMD_CONVERT_V2F32_TO_V2I32( __vec__ ) vcvt_s32_f32( __vec__ )
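
/// Example (illustrative sketch, not part of the SDK API): round-tripping four
/// floats through the conversion macros; note the truncation toward zero.
/// \code
/// AKSIMD_V4F32 vFloats = AKSIMD_SET_V4F32( 1.75f );
/// AKSIMD_V4I32 vInts   = AKSIMD_CONVERT_V4F32_TO_V4I32( vFloats ); // { 1, 1, 1, 1 }
/// AKSIMD_V4F32 vBack   = AKSIMD_CONVERT_V4I32_TO_V4F32( vInts );   // { 1.f, 1.f, 1.f, 1.f }
/// \endcode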

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD logical operations
//@{

/// Computes the bitwise AND of the 128-bit value in a and the
/// 128-bit value in b (see _mm_and_si128)
#define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )

/// Compares the 8 signed 16-bit integers in a and the 8 signed
/// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
    vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )

/// Compares for less than or equal (see _mm_cmple_ps)
#define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )

#define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
#define AKSIMD_CMPGT_V4I32( __a__, __b__)  vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))

#define AKSIMD_XOR_V4I32(__a__, __b__)  veorq_s32(__a__, __b__)

static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
{
    uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
    uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
    uint32x4_t res = veorq_u32(t0, t1);
    return vreinterpretq_f32_u32(res);
}

#define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
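
/// Example (illustrative sketch, not part of the SDK API): negating four floats
/// by XOR-ing the sign bit, using AKSIMD_XOR_V4F32 defined above.
/// \code
/// AKSIMD_V4F32 vIn      = AKSIMD_SET_V4F32( 2.f );
/// AKSIMD_V4F32 vSignBit = vreinterpretq_f32_u32( vdupq_n_u32( 0x80000000 ) );
/// AKSIMD_V4F32 vNegated = AKSIMD_XOR_V4F32( vIn, vSignBit ); // { -2.f, -2.f, -2.f, -2.f }
/// \endcode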
//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shifting
//@{

/// Shifts the 4 signed or unsigned 32-bit integers in a left by
/// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
    vshlq_n_s32( (__vec__), (__shiftBy__) )

/// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
/// bits while shifting in the sign bit (see _mm_srai_epi32).
/// Careful: vrshrq_n_s32 rounds the result, whereas _mm_srai_epi32 does not.
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
    vrshrq_n_s32( (__vec__), (__shiftBy__) )
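
/// Example (illustrative sketch, not part of the SDK API): moving Q15
/// fixed-point samples up to Q23 and back; the shift amounts must be
/// compile-time constants.
/// \code
/// AKSIMD_V4I32 vQ15  = AKSIMD_SET_V4I32( 16384 );               // 0.5 in Q15
/// AKSIMD_V4I32 vQ23  = AKSIMD_SHIFTLEFT_V4I32( vQ15, 8 );       // 0.5 in Q23
/// AKSIMD_V4I32 vBack = AKSIMD_SHIFTRIGHTARITH_V4I32( vQ23, 8 ); // 0.5 in Q15 again
/// \endcode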

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD shuffling
//@{

// Macro for combining two vectors of 2 elements into one vector of
// 4 elements.
#define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )

// Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

/// Selects four specific single-precision, floating-point values from
/// a and b, based on the mask i (see _mm_shuffle_ps)
// Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
// If you get a link error, it's probably because the required
// _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > is not implemented in
// <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
    _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )

/// Barrel-shift all floats by one.
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))

// Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are
// implemented in a separate header file to keep this one cleaner:
#include <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>

/// Moves the upper two single-precision, floating-point values of b to
/// the lower two single-precision, floating-point values of the result.
/// The upper two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
inline AKSIMD_V4F32 AKSIMD_MOVEHL_V4F32( const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw )
{
    //return akshuffle_zwcd( xyzw, abcd );
    AKSIMD_V2F32 zw = vget_high_f32( xyzw );
    AKSIMD_V2F32 cd = vget_high_f32( abcd );
    AKSIMD_V4F32 zwcd = vcombine_f32( zw, cd );
    return zwcd;
}

/// Moves the lower two single-precision, floating-point values of b to
/// the upper two single-precision, floating-point values of the result.
/// The lower two single-precision, floating-point values of a are passed
/// through to the result.
/// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
inline AKSIMD_V4F32 AKSIMD_MOVELH_V4F32( const AKSIMD_V4F32& xyzw, const AKSIMD_V4F32& abcd )
{
    return vcombine_f32( vget_low_f32( xyzw ), vget_low_f32( abcd ) );
}

/// Swaps the two lower floats with each other and the two higher floats with each other.
//#define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
#define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )

/// Swaps the two lower floats with the two higher floats.
//#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
#define AKSIMD_SHUFFLE_CDAB( __a__ ) vcombine_f32( vget_high_f32(__a__), vget_low_f32(__a__) )

/// Duplicates the odd items into the even items (d c b a -> d d b b )
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even items into the odd items (d c b a -> c c a a )
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
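
/// Example (illustrative sketch, not part of the SDK API): reversing the lane
/// order of a vector; AKSIMD_SHUFFLE takes its lane indices from high (fp3)
/// down to low (fp0).
/// \code
/// AKSIMD_V4F32 vIn  = { 1.f, 2.f, 3.f, 4.f };
/// AKSIMD_V4F32 vRev = AKSIMD_SHUFFLE_V4F32( vIn, vIn, AKSIMD_SHUFFLE(0,1,2,3) ); // { 4.f, 3.f, 2.f, 1.f }
/// \endcode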

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD arithmetic
//@{

/// Subtracts the four single-precision, floating-point values of
/// a and b (a - b) (see _mm_sub_ps)
#define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )

/// Subtracts the two single-precision, floating-point values of
/// a and b
#define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )

/// Subtracts the lower single-precision, floating-point values of a and b.
/// The upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
#define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
    vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )

/// Adds the four single-precision, floating-point values of
/// a and b (see _mm_add_ps)
#define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )

/// Adds the two single-precision, floating-point values of
/// a and b
#define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )

/// Adds the four integers of a and b
#define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )

/// Multiplies the 4 low-parts of both operands into the 4 32-bit integers (no overflow)
#define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)

/// Compare the content of four single-precision, floating-point values of
/// a and b
#define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )

/// Compare the content of two single-precision, floating-point values of
/// a and b
#define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )

/// Adds the lower single-precision, floating-point values of a and b; the
/// upper three single-precision, floating-point values are passed through from a.
/// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
#define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
    vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )

/// Multiplies the four single-precision, floating-point values
/// of a and b (see _mm_mul_ps)
#define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )

/// Multiplies the four single-precision, floating-point values of a
/// by the single-precision, floating-point scalar b
#define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )

/// Rough estimation of division
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32( AKSIMD_V4F32 a, AKSIMD_V4F32 b )
{
    AKSIMD_V4F32 inv = vrecpeq_f32(b);
    AKSIMD_V4F32 restep = vrecpsq_f32(b, inv);
    inv = vmulq_f32(restep, inv);
    return vmulq_f32(a, inv);
}
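
/// Where more precision is required, the reciprocal estimate can be refined
/// with an extra Newton-Raphson step before the final multiply (a sketch, not
/// part of the SDK API):
/// \code
/// AKSIMD_V4F32 DivPrecise( AKSIMD_V4F32 a, AKSIMD_V4F32 b )
/// {
///     AKSIMD_V4F32 inv = vrecpeq_f32( b );
///     inv = vmulq_f32( vrecpsq_f32( b, inv ), inv ); // first refinement
///     inv = vmulq_f32( vrecpsq_f32( b, inv ), inv ); // second refinement
///     return vmulq_f32( a, inv );
/// }
/// \endcode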

/// Multiplies the two single-precision, floating-point values
/// of a and b
#define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )

/// Multiplies the two single-precision, floating-point values of a
/// by the single-precision, floating-point scalar b
#define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )

/// Multiplies the lower single-precision, floating-point values of
/// a and b; the upper three single-precision, floating-point values
/// are passed through from a.
/// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_mul_ss)
#define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
    vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )

/// Vector multiply-add operation.
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )

/// Vector multiply-subtract operation. Careful: vmlsq_f32 does c-(a*b) and not the expected (a*b)-c
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) \
    AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )


#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) \
    AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )

#define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) \
    AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )

#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )

/// Vector multiply-add operation.
AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32( const AKSIMD_V4F32& __a__, const AKSIMD_V4F32& __b__, const AKSIMD_V4F32& __c__ )
{
    return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
}

/// Computes the minima of the four single-precision, floating-point
/// values of a and b (see _mm_min_ps)
#define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )

/// Computes the minima of the two single-precision, floating-point
/// values of a and b
#define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )

/// Computes the maxima of the four single-precision, floating-point
/// values of a and b (see _mm_max_ps)
#define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )

/// Computes the maxima of the two single-precision, floating-point
/// values of a and b
#define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )

/// Returns absolute value
#define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))

/// Changes the sign
#define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
#define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )

/// Approximate square root (4 floats)
#define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
#define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )

/// Approximate square root (2 floats)
#define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )

/// Faked in-place vector horizontal add.
/// \akwarning
/// Don't expect this to be very efficient.
/// \endakwarning
static AkForceInline void AKSIMD_HORIZONTALADD( AKSIMD_V4F32 & vVec )
{
    AKSIMD_V4F32 vHighLow = AKSIMD_MOVEHL_V4F32(vVec, vVec);
    vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
    vHighLow = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0x55);
    vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
}
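
/// Example (illustrative sketch, not part of the SDK API): summing the four
/// lanes of a vector; after the call, lane 0 holds the total.
/// \code
/// AKSIMD_V4F32 vAcc = { 1.f, 2.f, 3.f, 4.f };
/// AKSIMD_HORIZONTALADD( vAcc );
/// AkReal32 fSum = vgetq_lane_f32( vAcc, 0 ); // 10.f
/// \endcode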

/// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts

#if defined(AK_IOS)

// V2 implementation (faster because ARM processors actually have an x2 pipeline)

static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
{
    static const AKSIMD_V2F32 vSign = { -1.f, 1.f };

    AKSIMD_V2F32 vCIn1a = vget_low_f32( vCIn1 );
    AKSIMD_V2F32 vCIn2a = vget_low_f32( vCIn2 );
    AKSIMD_V2F32 vTmpa0 = vmul_n_f32( vCIn2a, vCIn1a[0] );
    AKSIMD_V2F32 vTmpa1 = vmul_n_f32( vCIn2a, vCIn1a[1] );
    vTmpa1 = vrev64_f32( vTmpa1 );
    vTmpa1 = vmul_f32( vTmpa1, vSign );
    vTmpa0 = vadd_f32( vTmpa0, vTmpa1 );

    AKSIMD_V2F32 vCIn1b = vget_high_f32( vCIn1 );
    AKSIMD_V2F32 vCIn2b = vget_high_f32( vCIn2 );
    AKSIMD_V2F32 vTmpb0 = vmul_n_f32( vCIn2b, vCIn1b[0] );
    AKSIMD_V2F32 vTmpb1 = vmul_n_f32( vCIn2b, vCIn1b[1] );
    vTmpb1 = vrev64_f32( vTmpb1 );
    vTmpb1 = vmul_f32( vTmpb1, vSign );
    vTmpb0 = vadd_f32( vTmpb0, vTmpb1 );

    return vcombine_f32( vTmpa0, vTmpb0 );
}

#else

// V4 implementation (kept in case future ARM processors actually have an x4 pipeline)

static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2 )
{
#ifdef AKSIMD_DECLARE_V4F32
    static const AKSIMD_DECLARE_V4F32( vSign, 1.f, -1.f, 1.f, -1.f );
#else
    static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f };
#endif

    AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
    vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
    AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
    vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
    vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
    vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 );
    vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
    return vTmp2;
}

#endif
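
/// Example (illustrative sketch, not part of the SDK API): multiplying two
/// pairs of complex numbers stored as { re0, im0, re1, im1 };
/// (1+2i)*(3+4i) = -5+10i.
/// \code
/// AKSIMD_V4F32 vA = { 1.f, 2.f, 1.f, 2.f };      // two copies of 1+2i
/// AKSIMD_V4F32 vB = { 3.f, 4.f, 3.f, 4.f };      // two copies of 3+4i
/// AKSIMD_V4F32 vC = AKSIMD_COMPLEXMUL( vA, vB ); // { -5.f, 10.f, -5.f, 10.f }
/// \endcode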

//@}
////////////////////////////////////////////////////////////////////////


////////////////////////////////////////////////////////////////////////
/// @name AKSIMD packing / unpacking
//@{

/// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
/// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
#define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )

/// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
/// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
#define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )

/// Selects and interleaves the lower two single-precision, floating-point
/// values from a and b (see _mm_unpacklo_ps)
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    // sce_vectormath_xayb(in_vec1, in_vec2)
    float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
    float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
    float32x2x2_t xa_yb = vtrn_f32( xy, ab );
    AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
    return xayb;
}

/// Selects and interleaves the upper two single-precision, floating-point
/// values from a and b (see _mm_unpackhi_ps)
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32( const AKSIMD_V4F32& in_vec1, const AKSIMD_V4F32& in_vec2 )
{
    //return sce_vectormath_zcwd( in_vec1, in_vec2 );
    float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
    float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
    float32x2x2_t zc_wd = vtrn_f32( zw, cd );
    AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
    return zcwd;
}

/// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
/// integers and saturates (see _mm_packs_epi32)
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32( const AKSIMD_V4I32& in_vec1, const AKSIMD_V4I32& in_vec2 )
{
    int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
    int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
    int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
    return vreinterpretq_s32_s16( result );
}
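
/// Example (illustrative sketch, not part of the SDK API): converting four
/// float samples scaled to the 16-bit range into saturated 16-bit PCM;
/// pfIn is a hypothetical input buffer.
/// \code
/// AKSIMD_V4F32 vScaled = AKSIMD_MUL_V4F32( AKSIMD_LOAD_V4F32( pfIn ), AKSIMD_SET_V4F32( 32767.f ) );
/// AKSIMD_V4I32 vInt32  = AKSIMD_CONVERT_V4F32_TO_V4I32( vScaled );
/// AKSIMD_V4I32 vPcm16  = AKSIMD_PACKS_V4I32( vInt32, vInt32 ); // 8 saturated 16-bit samples
/// \endcode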

/// V1 = {a,b}   =>   VR = {b,c}
/// V2 = {c,d}   =>
#define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )

/// V1 = {a,b}   =>   V1 = {a,c}
/// V2 = {c,d}   =>   V2 = {b,d}
#define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )

#define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )

/// V1 = {a,b}   =>   VR = {b,a}
#define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )

//@}
////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////
/// @name AKSIMD vector comparison
/// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
//@{

#define AKSIMD_CMP_CTRLMASK uint32x4_t

/// Compare each float element and return control mask.
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))

/// Compare each integer element and return control mask.
#define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))

/// Compare each float element and return control mask.
#define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))

/// Compare each integer element and return control mask.
#define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))

/// Returns a where the control mask is 0 and b where the control mask is non-zero; the control mask c is usually produced by the comparison operations above.
#define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )

// (cond1 >= cond2) ? b : a.
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )

// a >= 0 ? b : c, with the arguments in the same order as the C++ ternary operator.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )

#define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))

static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 )
{
#ifdef AKSIMD_DECLARE_V4F32
    static const AKSIMD_DECLARE_V4I32(movemask, 1, 2, 4, 8);
    static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
#else
    static const uint32x4_t movemask = { 1, 2, 4, 8 };
    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
#endif

    uint32x4_t t0 = in_vec1;
    uint32x4_t t1 = vtstq_u32(t0, highbit);
    uint32x4_t t2 = vandq_u32(t1, movemask);
    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
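
/// Example (illustrative sketch, not part of the SDK API): clamping negative
/// lanes to zero with a comparison mask and AKSIMD_VSEL_V4F32, then checking
/// whether any lane was clamped.
/// \code
/// AKSIMD_V4F32 vIn   = { -1.f, 2.f, -3.f, 4.f };
/// AKSIMD_V4COND vNeg = AKSIMD_LT_V4F32( vIn, AKSIMD_SETZERO_V4F32() );
/// AKSIMD_V4F32 vOut  = AKSIMD_VSEL_V4F32( vIn, AKSIMD_SETZERO_V4F32(), vNeg ); // { 0.f, 2.f, 0.f, 4.f }
/// bool bAnyClamped   = AKSIMD_MASK_V4F32( vNeg ) != 0; // true: lanes 0 and 2 were negative
/// \endcode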

#ifndef AK_WIN
static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
{
    return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
}
#endif

//@}
////////////////////////////////////////////////////////////////////////

#endif //_AKSIMD_ARM_NEON_H_