Table of Contents

Wwise SDK 2019.1.6
AkSimd.h
Go to the documentation of this file.
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Version: <VERSION> Build: <BUILDNUMBER>
25  Copyright (c) <COPYRIGHTYEAR> Audiokinetic Inc.
26 *******************************************************************************/
27 
28 // AkSimd.h
29 
30 /// \file
31 /// AKSIMD - arm_neon implementation
32 
33 #ifndef _AKSIMD_ARM_NEON_H_
34 #define _AKSIMD_ARM_NEON_H_
35 
36 #if defined _MSC_VER && defined _M_ARM64
37  #include <arm64_neon.h>
38 #else
39  #include <arm_neon.h>
40 #endif
42 
43 // Platform specific defines for prefetching
44 
45 /*
46 // ??????
47 #define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
48 // ??????
49 #define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)
50 /// Cross-platform memory prefetch of effective address assuming non-temporal data
51 // ??????
52 #define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )
53 */
54 
55 ////////////////////////////////////////////////////////////////////////
56 /// @name Platform specific memory size alignment for allocation purposes
57 //@{
58 
59 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
60 
61 //@}
62 ////////////////////////////////////////////////////////////////////////
63 
64 ////////////////////////////////////////////////////////////////////////
65 /// @name AKSIMD types
66 //@{
67 
68 typedef int32x4_t AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers
69 typedef int16x8_t AKSIMD_V8I16; ///< Vector of 8 16-bit signed integers
70 typedef int16x4_t AKSIMD_V4I16; ///< Vector of 4 16-bit signed integers
71 typedef uint32x4_t AKSIMD_V4UI32; ///< Vector of 4 32-bit unsigned integers
72 typedef uint32x2_t AKSIMD_V2UI32; ///< Vector of 2 32-bit unsigned integers
73 typedef int32x2_t AKSIMD_V2I32; ///< Vector of 2 32-bit signed integers
74 typedef float32_t AKSIMD_F32; ///< 32-bit float
75 typedef float32x2_t AKSIMD_V2F32; ///< Vector of 2 32-bit floats
76 typedef float32x4_t AKSIMD_V4F32; ///< Vector of 4 32-bit floats
77 
78 typedef uint32x4_t AKSIMD_V4COND; ///< Vector of 4 comparison results (generic)
79 typedef uint32x4_t AKSIMD_V4ICOND; ///< Vector of 4 integer comparison results
80 typedef uint32x4_t AKSIMD_V4FCOND; ///< Vector of 4 float comparison results
81 
82 #if defined(AK_CPU_ARM_NEON)
83 typedef float32x2x2_t AKSIMD_V2F32X2;
84 typedef float32x4x2_t AKSIMD_V4F32X2;
85 typedef float32x4x4_t AKSIMD_V4F32X4;
86 #endif
87 
88 //@}
89 ////////////////////////////////////////////////////////////////////////
90 
91 ////////////////////////////////////////////////////////////////////////
92 /// @name AKSIMD loading / setting
93 //@{
94 
95 /// Loads four single-precision, floating-point values (see _mm_load_ps)
96 #define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
97 
98 /// Loads four single-precision floating-point values from unaligned
99 /// memory (see _mm_loadu_ps)
100 #define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
101 
102 /// Loads a single single-precision, floating-point value, copying it into
103 /// all four words (see _mm_load1_ps, _mm_load_ps1)
104 #define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
105 
106 /// Sets the four single-precision, floating-point values to __scalar__ (see
107 /// _mm_set1_ps, _mm_set_ps1)
108 #define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
109 
110 /// Sets the four integer values to __scalar__
111 #define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
112 
113 /// Sets the four single-precision, floating-point values to zero (see
114 /// _mm_setzero_ps)
115 #define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
116 
117 /// Loads a single-precision, floating-point value into the low word
118 /// and clears the upper three words.
119 /// r0 := *p; r1 := 0.0 ; r2 := 0.0 ; r3 := 0.0 (see _mm_load_ss)
120 // Note: trailing semicolon removed from the expansion so the macro can be
120 // used inside expressions, consistent with the other AKSIMD_LOAD_* macros.
120 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
121 
122 /// Loads four 32-bit signed integer values (aligned)
123 #define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
124 
125 /// Loads 8 16-bit signed integer values (aligned)
126 #define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
127 
128 /// Loads 4 16-bit signed integer values (aligned)
129 #define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
130 
131 /// Loads unaligned 128-bit value (see _mm_loadu_si128)
132 #define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
133 /// Sets the four 32-bit integer values to zero (see _mm_setzero_si128)
134 #define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
135 
136 /// Loads two single-precision, floating-point values
137 #define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
138 /// Loads one single-precision float from memory into lane __lane__ of __vec__,
138 /// passing the other lane through. Trailing semicolon removed from the
138 /// expansion so the macro is usable in expression context.
138 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
139 
140 /// Sets the two single-precision, floating-point values to __scalar__
141 #define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
142 
143 /// Sets the two single-precision, floating-point values to zero
144 #define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
145 
146 /// Loads data from memory and de-interleaves
147 #define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
148 #define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
149 
150 /// Loads data from memory and de-interleaves; only selected lane
150 /// (trailing semicolons removed from the expansions so the macros can be
150 /// used in expression context, consistent with the other load macros)
151 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
152 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
153 
154 //@}
155 ////////////////////////////////////////////////////////////////////////
156 
157 
158 ////////////////////////////////////////////////////////////////////////
159 /// @name AKSIMD storing
160 //@{
161 
162 /// Stores four single-precision, floating-point values. The address must be 16-byte aligned
163 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
164 
165 /// Stores four single-precision, floating-point values. The address does not need to be 16-byte aligned.
166 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
167 
168 /// Stores the lower single-precision, floating-point value.
169 /// *p := a0 (see _mm_store_ss)
170 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
171 
172 /// Stores four 32-bit integer values. The address must be 16-byte aligned.
173 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
174 
175 /// Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
176 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
177 
178 /// Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
179 #define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
180 
181 /// Stores two single-precision, floating-point values. The address must be 16-byte aligned.
182 // Cast changed from AkReal32* to float32_t* for consistency with every other
182 // AKSIMD store/load macro in this file (the two types are both 32-bit floats).
182 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (float32_t*)(__addr__), (__vName__) )
183 
184 /// Stores data by interleaving into memory
185 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
186 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
187 
188 //@}
189 ////////////////////////////////////////////////////////////////////////
190 
191 
192 ////////////////////////////////////////////////////////////////////////
193 /// @name AKSIMD conversion
194 //@{
195 
196 /// Converts the four signed 32-bit integer values of a to single-precision,
197 /// floating-point values (see _mm_cvtepi32_ps)
198 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
199 
200 /// Converts the four single-precision, floating-point values of a to signed
201 /// 32-bit integer values (see _mm_cvtps_epi32)
201 /// NOTE(review): vcvtq_s32_f32 truncates toward zero, whereas _mm_cvtps_epi32
201 /// rounds to nearest — results can differ from the SSE path by 1 for
201 /// non-integral inputs; confirm callers tolerate this.
202 #define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( __vec__ )
203 
204 /// Converts the four single-precision, floating-point values of a to signed
205 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
206 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
207 
208 /// Converts the two single-precision, floating-point values of a to signed
209 /// 32-bit integer values
210 #define AKSIMD_CONVERT_V2F32_TO_V2I32( __vec__ ) vcvt_s32_f32( __vec__ )
211 
212 //@}
213 ////////////////////////////////////////////////////////////////////////
214 
215 
216 ////////////////////////////////////////////////////////////////////////
217 /// @name AKSIMD logical operations
218 //@{
219 
220 /// Computes the bitwise AND of the 128-bit value in a and the
221 /// 128-bit value in b (see _mm_and_si128)
222 #define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
223 
224 /// Compares the 8 signed 16-bit integers in a and the 8 signed
225 /// 16-bit integers in b for greater than (see _mm_cmpgt_epi16)
226 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
227  vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
228 
229 /// Compares for less than or equal (see _mm_cmple_ps)
230 #define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
231 
232 #define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
233 #define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
234 
235 #define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
236 
237 /// Bitwise XOR of two float vectors: lanes are reinterpreted as 32-bit
237 /// unsigned integers, XORed, and reinterpreted back as floats.
237 static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32( const AKSIMD_V4F32& in_vec0, const AKSIMD_V4F32& in_vec1 )
238 {
239  return vreinterpretq_f32_u32(
240   veorq_u32( vreinterpretq_u32_f32( in_vec0 ),
241       vreinterpretq_u32_f32( in_vec1 ) ) );
242 }
244 
245 #define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
246 //@}
247 ////////////////////////////////////////////////////////////////////////
248 
249 
250 ////////////////////////////////////////////////////////////////////////
251 /// @name AKSIMD shifting
252 //@{
253 
254 /// Shifts the 4 signed or unsigned 32-bit integers in a left by
255 /// in_shiftBy bits while shifting in zeros (see _mm_slli_epi32)
256 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
257  vshlq_n_s32( (__vec__), (__shiftBy__) )
258 
259 /// Shifts the 4 signed 32-bit integers in a right by in_shiftBy
260 /// bits while shifting in the sign bit (see _mm_srai_epi32)
261 // Fixed: vrshrq_n_s32 is a *rounding* shift, which does not match the
261 // documented _mm_srai_epi32 semantics; vshrq_n_s32 is the plain arithmetic
261 // shift-right and matches the SSE contract exactly.
261 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
262  vshrq_n_s32( (__vec__), (__shiftBy__) )
263 
264 //@}
265 ////////////////////////////////////////////////////////////////////////
266 
267 
268 ////////////////////////////////////////////////////////////////////////
269 /// @name AKSIMD shuffling
270 //@{
271 
272 // Macro for combining two vector of 2 elements into one vector of
273 // 4 elements.
274 #define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
275 
276 // Macro for shuffle parameter for AKSIMD_SHUFFLE_V4F32() (see _MM_SHUFFLE)
277 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
278  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
279 
280 /// Selects four specific single-precision, floating-point values from
281 /// a and b, based on the mask i (see _mm_shuffle_ps)
282 // Usage: AKSIMD_SHUFFLE_V4F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
283 // If you get a link error, it's probably because the required
284 // _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > is not implemented in
285 // <AK/SoundEngine/Platforms/arm_neon/AkSimdShuffle.h>.
286 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
287  _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
288 
289 /// Barrel-shift all floats by one.
290 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
291 
292 // Various combinations of zyxw for _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw > are
293 // implemented in a separate header file to keep this one cleaner:
295 
296 /// Moves the upper two single-precision, floating-point values of b to
297 /// the lower two single-precision, floating-point values of the result.
298 /// The upper two single-precision, floating-point values of a are passed
299 /// through to the result.
300 /// r3 := a3; r2 := a2; r1 := b3; r0 := b2 (see _mm_movehl_ps)
302 {
303  //return akshuffle_zwcd( xyzw, abcd );
304  AKSIMD_V2F32 zw = vget_high_f32( xyzw );
305  AKSIMD_V2F32 cd = vget_high_f32( abcd );
306  AKSIMD_V4F32 zwcd = vcombine_f32( zw , cd );
307  return zwcd;
308 }
309 
310 /// Moves the lower two single-precision, floating-point values of b to
311 /// the upper two single-precision, floating-point values of the result.
312 /// The lower two single-precision, floating-point values of a are passed
313 /// through to the result.
314 /// r3 := b1 ; r2 := b0 ; r1 := a1 ; r0 := a0 (see _mm_movelh_ps)
315 inline AKSIMD_V4F32 AKSIMD_MOVELH_V4F32( const AKSIMD_V4F32& xyzw, const AKSIMD_V4F32& abcd )
316 {
317  // r = { x, y, a, b }: keep the low half of the first vector and append
317  // the low half of the second (see _mm_movelh_ps).
317  AKSIMD_V2F32 xy = vget_low_f32( xyzw );
317  AKSIMD_V2F32 ab = vget_low_f32( abcd );
317  return vcombine_f32( xy, ab );
318 }
319 
320 /// Swap the 2 lower floats together and the 2 higher floats together.
321 //#define AKSIMD_SHUFFLE_BADC( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
322 #define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
323 
324 /// Swap the 2 lower floats with the 2 higher floats.
325 //#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
326 #define AKSIMD_SHUFFLE_CDAB( __a__ ) vcombine_f32( vget_high_f32(__a__), vget_low_f32(__a__) )
327 
328 /// Duplicates the odd items into the even items (d c b a -> d d b b )
329 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
330 
331 /// Duplicates the even items into the odd items (d c b a -> c c a a )
332 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
333 
334 //@}
335 ////////////////////////////////////////////////////////////////////////
336 
337 
338 ////////////////////////////////////////////////////////////////////////
339 /// @name AKSIMD arithmetic
340 //@{
341 
342 /// Subtracts the four single-precision, floating-point values of
343 /// a and b (a - b) (see _mm_sub_ps)
344 #define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
345 
346 /// Subtracts the two single-precision, floating-point values of
347 /// a and b
348 #define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
349 
350 /// Subtracts the lower single-precision, floating-point values of a and b.
351 /// The upper three single-precision, floating-point values are passed through from a.
352 /// r0 := a0 - b0 ; r1 := a1 ; r2 := a2 ; r3 := a3 (see _mm_sub_ss)
353 // Trailing semicolon removed from the expansion, consistent with
353 // AKSIMD_ADD_SS_V4F32 and AKSIMD_MUL_SS_V4F32 below.
353 #define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
354  vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
355 
356 /// Adds the four single-precision, floating-point values of
357 /// a and b (see _mm_add_ps)
358 #define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
359 
360 /// Adds the two single-precision, floating-point values of
361 /// a and b
362 #define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
363 
364 /// Adds the four integers of a and b
365 #define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
366 
367 /// Multiplies the 4 low-parts of both operand into the 4 32-bit integers (no overflow)
368 #define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
369 
370 /// Compare the content of four single-precision, floating-point values of
371 /// a and b
372 #define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
373 
374 /// Compare the content of two single-precision, floating-point values of
375 /// a and b
376 #define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
377 
378 /// Adds the lower single-precision, floating-point values of a and b; the
379 /// upper three single-precision, floating-point values are passed through from a.
380 /// r0 := a0 + b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
381 #define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
382  vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
383 
384 /// Multiplies the four single-precision, floating-point values
385 /// of a and b (see _mm_mul_ps)
386 #define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
387 
388 /// Multiplies the four single-precision, floating-point values of a
389 /// by the single-precision, floating-point scalar b
390 #define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
391 
392 /// Rough estimation of division
394 {
395  AKSIMD_V4F32 inv = vrecpeq_f32(b);
396  AKSIMD_V4F32 restep = vrecpsq_f32(b, inv);
397  inv = vmulq_f32(restep, inv);
398  return vmulq_f32(a, inv);
399 }
400 
401 /// Multiplies the two single-precision, floating-point values
402 /// of a and b
403 #define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
404 
405 /// Multiplies the two single-precision, floating-point values of a
406 /// by the single-precision, floating-point scalar b
407 #define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
408 
409 /// Multiplies the lower single-precision, floating-point values of
410 /// a and b; the upper three single-precision, floating-point values
411 /// are passed through from a.
412 /// r0 := a0 * b0; r1 := a1; r2 := a2; r3 := a3 (see _mm_add_ss)
413 #define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
414  vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
415 
416 /// Vector multiply-add operation.
417 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
418 
419 /// Vector multiply-substract operation. Careful: vmlsq_f32 does c-(a*b) and not the expected (a*b)-c
420 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) \
421  AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
422 
423 
424 #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) \
425  AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
426 
427 #define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) \
428  AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
429 
430 #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
431 #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
432 
433 /// Vector multiply-add operation.
435 {
436  return AKSIMD_ADD_SS_V4F32( AKSIMD_MUL_SS_V4F32( __a__, __b__ ), __c__ );
437 }
438 
439 /// Computes the minima of the four single-precision, floating-point
440 /// values of a and b (see _mm_min_ps)
441 #define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
442 
443 /// Computes the minima of the two single-precision, floating-point
444 /// values of a and b
445 #define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
446 
447 /// Computes the maximums of the four single-precision, floating-point
448 /// values of a and b (see _mm_max_ps)
449 #define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
450 
451 /// Computes the maximums of the two single-precision, floating-point
452 /// values of a and b
453 #define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
454 
455 /// Returns absolute value
456 #define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
457 
458 /// Changes the sign
459 #define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
460 #define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
461 
462 /// Square root (4 floats)
462 /// NOTE(review): implemented as reciprocal-estimate of the reciprocal-square-root
462 /// estimate (no Newton-Raphson refinement), so this is a low-precision
462 /// approximation of sqrt, not an exact square root — confirm callers tolerate this.
463 #define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
464 
465 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
466 #define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
467 
468 /// Square root (2 floats)
468 /// NOTE(review): same estimate-only precision caveat as AKSIMD_SQRT_V4F32.
469 #define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
470 
471 /// Faked in-place vector horizontal add.
472 /// \akwarning
473 /// Don't expect this to be very efficient.
474 /// \endakwarning
476 {
477  AKSIMD_V4F32 vHighLow = AKSIMD_MOVEHL_V4F32(vVec, vVec);
478  vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
479  vHighLow = AKSIMD_SHUFFLE_V4F32(vVec, vVec, 0x55);
480  vVec = AKSIMD_ADD_V4F32(vVec, vHighLow);
481 }
482 
483 /// Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts
484 
485 #if defined(AK_IOS)
486 
487 // V2 implementation (faster 'cause ARM processors actually have an x2 pipeline)
488 
490 {
491  static const AKSIMD_V2F32 vSign = { -1.f, 1.f };
492 
493  AKSIMD_V2F32 vCIn1a = vget_low_f32( vCIn1 );
494  AKSIMD_V2F32 vCIn2a = vget_low_f32( vCIn2 );
495  AKSIMD_V2F32 vTmpa0 = vmul_n_f32( vCIn2a, vCIn1a[0] );
496  AKSIMD_V2F32 vTmpa1 = vmul_n_f32( vCIn2a, vCIn1a[1] );
497  vTmpa1 = vrev64_f32( vTmpa1 );
498  vTmpa1 = vmul_f32( vTmpa1, vSign );
499  vTmpa0 = vadd_f32( vTmpa0, vTmpa1 );
500 
501  AKSIMD_V2F32 vCIn1b = vget_high_f32( vCIn1 );
502  AKSIMD_V2F32 vCIn2b = vget_high_f32( vCIn2 );
503  AKSIMD_V2F32 vTmpb0 = vmul_n_f32( vCIn2b, vCIn1b[0] );
504  AKSIMD_V2F32 vTmpb1 = vmul_n_f32( vCIn2b, vCIn1b[1] );
505  vTmpb1 = vrev64_f32( vTmpb1 );
506  vTmpb1 = vmul_f32( vTmpb1, vSign );
507  vTmpb0 = vadd_f32( vTmpb0, vTmpb1 );
508 
509  return vcombine_f32( vTmpa0, vTmpb0 );
510 }
511 
512 #else
513 
514 // V4 implementation (kept in case future ARM processors actually have an x4 pipeline)
515 
517 {
518 #ifdef AKSIMD_DECLARE_V4F32
519  static const AKSIMD_DECLARE_V4F32( vSign, 1.f, -1.f, 1.f, -1.f );
520 #else
521  static const AKSIMD_V4F32 vSign = { 1.f, -1.f, 1.f, -1.f };
522 #endif
523 
524  AKSIMD_V4F32 vTmp1 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(2,2,0,0));
525  vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
526  AKSIMD_V4F32 vTmp2 = AKSIMD_SHUFFLE_V4F32( vCIn1, vCIn1, AKSIMD_SHUFFLE(3,3,1,1));
527  vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
528  vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vCIn2 );
529  vTmp2 = AKSIMD_SHUFFLE_BADC( vTmp2 );
530  vTmp2 = AKSIMD_ADD_V4F32( vTmp2, vTmp1 );
531  return vTmp2;
532 }
533 
534 #endif
535 
536 //@}
537 ////////////////////////////////////////////////////////////////////////
538 
539 
540 ////////////////////////////////////////////////////////////////////////
541 /// @name AKSIMD packing / unpacking
542 //@{
543 
544 /// Interleaves the lower 4 signed or unsigned 16-bit integers in a with
545 /// the lower 4 signed or unsigned 16-bit integers in b (see _mm_unpacklo_epi16)
546 #define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
547 
548 /// Interleaves the upper 4 signed or unsigned 16-bit integers in a with
549 /// the upper 4 signed or unsigned 16-bit integers in b (see _mm_unpackhi_epi16)
550 #define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
551 
552 /// Selects and interleaves the lower two single-precision, floating-point
553 /// values from a and b (see _mm_unpacklo_ps)
555 {
556  // sce_vectormath_xayb(in_vec1, in_vec2)
557  float32x2_t xy = vget_low_f32( in_vec1 /*xyzw*/ );
558  float32x2_t ab = vget_low_f32( in_vec2 /*abcd*/ );
559  float32x2x2_t xa_yb = vtrn_f32( xy, ab );
560  AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
561  return xayb;
562 }
563 
564 /// Selects and interleaves the upper two single-precision, floating-point
565 /// values from a and b (see _mm_unpackhi_ps)
567 {
568  //return sce_vectormath_zcwd( in_vec1, in_vec2 );
569  float32x2_t zw = vget_high_f32( in_vec1 /*xyzw*/ );
570  float32x2_t cd = vget_high_f32( in_vec2 /*abcd*/ );
571  float32x2x2_t zc_wd = vtrn_f32( zw, cd );
572  AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
573  return zcwd;
574 }
575 
576 /// Packs the 8 signed 32-bit integers from a and b into signed 16-bit
577 /// integers and saturates (see _mm_packs_epi32)
579 {
580  int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
581  int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
582  int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
583  return vreinterpretq_s32_s16( result );
584 }
585 
586 /// V1 = {a,b} => VR = {b,c}
587 /// V2 = {c,d} =>
588 #define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
589 
590 /// V1 = {a,b} => V1 = {a,c}
591 /// V2 = {c,d} => V2 = {b,d}
592 #define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
593 
594 #define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
595 
596 /// V1 = {a,b} => VR = {b,a}
597 #define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
598 
599 //@}
600 ////////////////////////////////////////////////////////////////////////
601 
602 ////////////////////////////////////////////////////////////////////////
603 /// @name AKSIMD vector comparison
604 /// Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.
605 //@{
606 
607 #define AKSIMD_CMP_CTRLMASK uint32x4_t
608 
609 /// Compare each float element and return control mask.
610 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
611 
612 /// Compare each float element and return control mask.
613 #define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
614 
615 /// Compare each float element and return control mask.
616 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
617 
618 /// Compare each float element and return control mask.
619 #define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
620 
621 /// Compare each integer element and return control mask.
622 #define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
623 
624 /// Compare each float element and return control mask.
625 #define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
626 
627 /// Compare each integer element and return control mask.
628 #define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
629 
630 /// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
631 #define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
632 
633 // (cond1 >= cond2) ? b : a.
634 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
635 
636 // a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax.
637 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
638 
639 #define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
640 
641 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4UI32& in_vec1 ) // Emulates _mm_movemask_ps: returns a 4-bit mask built from each lane's sign bit
642 {
643 #ifdef AKSIMD_DECLARE_V4F32
644  static const AKSIMD_DECLARE_V4I32(movemask, 1, 2, 4, 8);
645  static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
646 #else
647  static const uint32x4_t movemask = { 1, 2, 4, 8 }; // bit position assigned to each lane
648  static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; // sign bit of each 32-bit lane
649 #endif
650 
651  uint32x4_t t0 = in_vec1;
652  uint32x4_t t1 = vtstq_u32(t0, highbit); // per-lane: all-ones if sign bit set, else zero
653  uint32x4_t t2 = vandq_u32(t1, movemask); // keep only that lane's mask bit (1/2/4/8)
654  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); // fold 4 lanes down to 2
655  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); // fold the final 2 lanes into one int
656 }
657 
658 #ifndef AK_WIN
659 /// Float overload: reinterprets the float vector's bits as unsigned and
659 /// forwards to the integer AKSIMD_MASK_V4F32 above.
659 static AkForceInline int AKSIMD_MASK_V4F32( const AKSIMD_V4F32& in_vec1 )
660 {
661  return AKSIMD_MASK_V4F32( vreinterpretq_u32_f32(in_vec1) );
662 }
663 #endif
664 
665 //@}
666 ////////////////////////////////////////////////////////////////////////
667 
668 #endif //_AKSIMD_ARM_NEON_H_
669 
float32_t AKSIMD_F32
32-bit float
Definition: AkSimd.h:74
float32x4x4_t AKSIMD_V4F32X4
Definition: AkSimd.h:85
int16x8_t AKSIMD_V8I16
Vector of 8 16-bit signed integers.
Definition: AkSimd.h:69
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKLO_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:554
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:393
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimd.h:78
uint32x4_t AKSIMD_V4ICOND
Vector of 4 comparison results.
Definition: AkSimd.h:79
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:358
static AkForceInline AKSIMD_V4F32 AKSIMD_XOR_V4F32(const AKSIMD_V4F32 &in_vec0, const AKSIMD_V4F32 &in_vec1)
Definition: AkSimd.h:237
AKSIMD_V4F32 AKSIMD_MOVEHL_V4F32(const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw)
Definition: AkSimd.h:301
uint32x2_t AKSIMD_V2UI32
Vector of 2 32-bit unsigned signed integers.
Definition: AkSimd.h:72
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:286
#define AKSIMD_MUL_SS_V4F32(__a__, __b__)
Definition: AkSimd.h:413
#define AKSIMD_DECLARE_V4I32(_x, _a, _b, _c, _d)
Definition: AkSimd.h:99
#define AKSIMD_SHUFFLE_BADC(__a__)
Swap the 2 lower floats together and the 2 higher floats together.
Definition: AkSimd.h:322
AkForceInline AKSIMD_V4F32 AKSIMD_UNPACKHI_V4F32(const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
Definition: AkSimd.h:566
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimd.h:76
#define AkForceInline
Force inlining.
Definition: AkTypes.h:62
#define AKSIMD_DECLARE_V4F32(_x, _a, _b, _c, _d)
Definition: AkSimd.h:95
int32x4_t AKSIMD_V4I32
Vector of 4 32-bit signed integers.
Definition: AkSimd.h:68
AkForceInline AKSIMD_V4I32 AKSIMD_PACKS_V4I32(const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)
Definition: AkSimd.h:578
float32x4x2_t AKSIMD_V4F32X2
Definition: AkSimd.h:84
static AkForceInline int AKSIMD_MASK_V4F32(const AKSIMD_V4UI32 &in_vec1)
Definition: AkSimd.h:641
uint32x4_t AKSIMD_V4UI32
Vector of 4 32-bit unsigned signed integers.
Definition: AkSimd.h:71
#define AKSIMD_MUL_V4F32(__a__, __b__)
Definition: AkSimd.h:386
float32x2x2_t AKSIMD_V2F32X2
Definition: AkSimd.h:83
float32x2_t AKSIMD_V2F32
Vector of 2 32-bit floats.
Definition: AkSimd.h:75
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:277
#define AKSIMD_ADD_SS_V4F32(__a__, __b__)
Definition: AkSimd.h:381
uint32x4_t AKSIMD_V4FCOND
Vector of 4 comparison results.
Definition: AkSimd.h:80
int16x4_t AKSIMD_V4I16
Vector of 4 16-bit signed integers.
Definition: AkSimd.h:70
AkForceInline AKSIMD_V4F32 AKSIMD_MADD_SS_V4F32(const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
Vector multiply-add operation.
Definition: AkSimd.h:434
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL(AKSIMD_V4F32 vCIn1, AKSIMD_V4F32 vCIn2)
Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary par...
Definition: AkSimd.h:489
AKSIMD_V4F32 AKSIMD_MOVELH_V4F32(const AKSIMD_V4F32 &xyzw, const AKSIMD_V4F32 &abcd)
Definition: AkSimd.h:315
int32x2_t AKSIMD_V2I32
Vector of 2 32-bit signed integers.
Definition: AkSimd.h:73
static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 &vVec)
Definition: AkSimd.h:475