Version
menu_open
link
Wwise SDK 2021.1.14
AkSimdAvx.h
Go to the documentation of this file.
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Version: v2021.1.14 Build: 6590
25  Copyright (c) 2006-2023 Audiokinetic Inc.
26 *******************************************************************************/
27 
28 // AkSimdAvx.h
29 
30 /// \file
31 /// AKSIMD - AVX implementation
32 
33 #ifndef _AK_SIMD_AVX_H_
34 #define _AK_SIMD_AVX_H_
35 
38 
39 #if defined(AKSIMD_AVX_SUPPORTED)
40 
41 #include <immintrin.h>
42 
43 ////////////////////////////////////////////////////////////////////////
44 /// @name AKSIMD types
45 //@{
46 
47 typedef float AKSIMD_F32; ///< 32-bit float
48 typedef __m256 AKSIMD_V8F32; ///< Vector of 8 32-bit floats
49 typedef __m256d AKSIMD_V4F64; ///< Vector of 4 64-bit floats
50 typedef __m256i AKSIMD_V8I32; ///< Vector of 8 32-bit signed integers
51 typedef AKSIMD_V8F32 AKSIMD_V8COND; ///< Vector of 8 comparison results, held in a float vector (AKSIMD_V8F32)
52 typedef AKSIMD_V8F32 AKSIMD_V8FCOND; ///< Vector of 8 comparison results, held in a float vector (AKSIMD_V8F32)
53 typedef AKSIMD_V8I32 AKSIMD_V8ICOND; ///< Vector of 8 comparison results, held in an integer vector (AKSIMD_V8I32)
54 
55 //@}
56 ////////////////////////////////////////////////////////////////////////
57 
58 
59 ////////////////////////////////////////////////////////////////////////
60 /// @name AKSIMD loading / setting
61 //@{
62 
63 /// Loads eight single-precision floating-point values from memory.
64 /// The address does not need to be 32-byte aligned (see _mm_loadu_ps).
65 /// On every modern x86 processor this performs the same as an aligned load.
66 #define AKSIMD_LOAD_V8F32( __addr__ ) _mm256_loadu_ps( (AkReal32*)(__addr__) )
67 
68 /// Loads a single single-precision, floating-point value, copying it into
69 /// all eight words (see _mm_load1_ps, _mm_load_ps1). __scalar__ must be an lvalue: its address is taken.
70 #define AKSIMD_LOAD1_V8F32( __scalar__ ) _mm256_broadcast_ss( &(__scalar__) )
71 
72 /// Loads a single double-precision, floating-point value, and copies it into
73 /// all elements of the vector (see _mm_load_pd1). __scalar__ must be an lvalue; the result is reinterpreted as a float vector by _mm256_castpd_ps.
74 #define AKSIMD_LOAD1_V4F64( __scalar__ ) _mm256_castpd_ps(_mm256_broadcast_sd( &(__scalar__) ))
75 
76 /// Sets the eight single-precision, floating-point values to __scalar__ (see
77 /// _mm_set1_ps, _mm_set_ps1)
78 #define AKSIMD_SET_V8F32( __scalar__ ) _mm256_set1_ps( (__scalar__) )
79 
80 /// Populates the full vector with the 8 floating point values provided (_h is the highest element, _a is element 0)
81 #define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )
82 
83 /// Sets the eight single-precision, floating-point values to zero (see
84 /// _mm_setzero_ps)
85 #define AKSIMD_SETZERO_V8F32() _mm256_setzero_ps()
86 
87 /// Loads a single-precision, floating-point value into the low word
88 /// and clears the upper seven words.
89 /// r0 := *p; r1...r7 := 0.0 (see _mm_load_ss)
90 #define AKSIMD_LOAD_SS_V8F32( __addr__ ) _mm256_zextps128_ps256(_mm_load_ss( (__addr__) ))
91 
92 /// Combines the two __m128's provided into the output __m256 (m1 becomes the lower 128b lane, m2 the upper)
93 /// Note that this should be utilized instead of, e.g. adding & utilizing a macro "AKSIMD_INSERT_V8I32(m, i, idx)"
94 /// Because there is no direct corresponding instruction for an insert into 256. You should load into 128s
95 /// and use that. Some compilers do not handle _mm256_insert_epi32 (etc) well, or even include them
96 #define AKSIMD_SET_V2F128( m1, m2) _mm256_setr_m128(m1, m2)
97 
98 #define AKSIMD_INSERT_V2F128( a, m128, idx) _mm256_insertf128_ps(a, m128, idx) ///< Returns a with the 128b lane selected by idx replaced by m128 (see _mm256_insertf128_ps)
99 
100 //@}
101 ////////////////////////////////////////////////////////////////////////
102 
103 
104 ////////////////////////////////////////////////////////////////////////
105 /// @name AKSIMD storing
106 //@{
107 
108 /// Stores eight single-precision, floating-point values.
109 /// The address does not need to be 32-byte aligned (see _mm_storeu_ps).
110 /// On every modern x86 processor this performs the same as an aligned store.
111 #define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
112 
113 /// Stores only the lowest single-precision, floating-point value (element 0) of the vector.
114 /// *p := a0 (see _mm_store_ss)
115 #define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) )
116 
117 //@}
118 ////////////////////////////////////////////////////////////////////////
119 
120 ////////////////////////////////////////////////////////////////////////
121 /// @name AKSIMD shuffling
122 //@{
123 
124 /// Selects eight specific single-precision, floating-point values from
125 /// a and b, based on the mask i within 128-bit lanes (see _mm256_shuffle_ps)
126 /// This means that the AKSIMD_SHUFFLE operand still picks 1 of 4 32b components
127 /// inside of each of the 2 128b lanes.
128 // Usage: AKSIMD_SHUFFLE_V8F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
129 #define AKSIMD_SHUFFLE_V8F32( a, b, i ) _mm256_shuffle_ps( a, b, i )
130 
131 /// For each 128b lane, Swap the 2 lower floats together and the 2 higher floats together. ( h g f e d c b a -> g h e f c d a b )
132 #define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
133 
134 /// For each 128b lane, Swap the 2 lower floats with the 2 higher floats. ( h g f e d c b a -> f e h g b a d c )
135 #define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
136 
137 /// For each 128b lane, barrel-shift all floats by one. ( h g f e d c b a -> e h g f a d c b )
138 #define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
139 
140 /// For each 128b lane, duplicates the odd items into the even items ( h g f e d c b a -> h h f f d d b b )
141 #define AKSIMD_DUP_V8_ODD(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
142 
143 /// For each 128b lane, duplicates the even items into the odd items ( h g f e d c b a -> g g e e c c a a )
144 #define AKSIMD_DUP_V8_EVEN(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
145 
146 /// Shuffle 32-bit integers from a and b within 128-bit lanes using the control in i, and return the results (same selection as AKSIMD_SHUFFLE_V8F32, applied to the raw bits)
147 #define AKSIMD_SHUFFLE_V8I32( a, b, i ) _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), i ))
148 
149 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and return the results (see _mm256_permutevar_ps).
150 #define AKSIMD_PERMUTEVAR_V8F32(a, b) _mm256_permutevar_ps(a, b)
151 
152 // Macro for selection parameter for AKSIMD_PERMUTE_2X128_V8F32(): l0 picks the lower result lane, l1 the upper (0/1 = lower/upper lane of a, 2/3 = lower/upper lane of b)
153 #define AKSIMD_PERMUTE128( l1, l0 ) (((l1) << 4) | (l0))
154 
155 /// For each 128b lane, select one of the four input 128b lanes across a and b,
156 /// based on the mask i. AKSIMD_SHUFFLE can still be directly used as a control
157 #define AKSIMD_PERMUTE_2X128_V8F32( a, b, i ) _mm256_permute2f128_ps(a, b, i)
158 
159 /// Selects the lower of each of the 128b lanes in a and b to be the result ( B A ), ( D C ) -> ( C A )
160 #define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(2, 0))
161 
162 /// Selects the higher of each of the 128b lanes in a and b to be the result ( B A ), ( D C) -> ( D B )
163 #define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(3, 1))
164 
165 /// Gets the specified 128b lane from a and stores it in the result
166 #define AKSIMD_EXTRACT_V2F128( a, i ) _mm256_extractf128_ps(a, i)
167 
168 /// Transposes, independently in each 128b lane, the 4x4 block whose rows are A, B, C and D. After the call:
169 /// A[7:0] = D[4] C[4] B[4] A[4] D[0] C[0] B[0] A[0]
170 /// B[7:0] = D[5] C[5] B[5] A[5] D[1] C[1] B[1] A[1]
171 /// C[7:0] = D[6] C[6] B[6] A[6] D[2] C[2] B[2] A[2]
172 /// D[7:0] = D[7] C[7] B[7] A[7] D[3] C[3] B[3] A[3]
173 AkForceInline void AKSIMD_TRANSPOSE8X4_V8F32(AKSIMD_V8F32& A, AKSIMD_V8F32& B, AKSIMD_V8F32& C, AKSIMD_V8F32& D)
174 {
175  // Stage 1: per lane, gather the two low / two high elements of the row pairs (A,B) and (C,D).
176  const AKSIMD_V8F32 vLoAB = AKSIMD_SHUFFLE_V8F32(A, B, AKSIMD_SHUFFLE(1, 0, 1, 0));
177  const AKSIMD_V8F32 vHiAB = AKSIMD_SHUFFLE_V8F32(A, B, AKSIMD_SHUFFLE(3, 2, 3, 2));
178  const AKSIMD_V8F32 vLoCD = AKSIMD_SHUFFLE_V8F32(C, D, AKSIMD_SHUFFLE(1, 0, 1, 0));
179  const AKSIMD_V8F32 vHiCD = AKSIMD_SHUFFLE_V8F32(C, D, AKSIMD_SHUFFLE(3, 2, 3, 2));
180  // Stage 2: per lane, pick the even / odd elements of the gathered pairs to form each output row.
181  A = AKSIMD_SHUFFLE_V8F32(vLoAB, vLoCD, AKSIMD_SHUFFLE(2, 0, 2, 0));
182  B = AKSIMD_SHUFFLE_V8F32(vLoAB, vLoCD, AKSIMD_SHUFFLE(3, 1, 3, 1));
183  C = AKSIMD_SHUFFLE_V8F32(vHiAB, vHiCD, AKSIMD_SHUFFLE(2, 0, 2, 0));
184  D = AKSIMD_SHUFFLE_V8F32(vHiAB, vHiCD, AKSIMD_SHUFFLE(3, 1, 3, 1));
185 }
186 
187 //@}
188 ////////////////////////////////////////////////////////////////////////
189 
190 
191 ////////////////////////////////////////////////////////////////////////
192 /// @name AKSIMD arithmetic
193 //@{
194 
195 /// Subtracts the eight single-precision, floating-point values of
196 /// a and b (a - b) (see _mm_sub_ps)
197 #define AKSIMD_SUB_V8F32( a, b ) _mm256_sub_ps( a, b )
198 
199 /// Subtracts the lower single-precision, floating-point values of a and b.
200 /// The upper seven single-precision, floating-point values are passed through from a (b is masked down to its lowest element first).
201 /// r0 := a0 - b0 ; r1...r7 := a1...a7 (see _mm_sub_ss)
202 #define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( a, _mm256_and_ps(b, _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )
203 
204 /// Adds the eight single-precision, floating-point values of
205 /// a and b (see _mm_add_ps)
206 #define AKSIMD_ADD_V8F32( a, b ) _mm256_add_ps( a, b )
207 
208 /// Performs alternating subs and adds of the eight single-precision,
209 /// floating-point values of a and b: even elements are a - b, odd elements are a + b (see _mm_addsub_ps)
210 #define AKSIMD_ADDSUB_V8F32( a, b ) _mm256_addsub_ps( a, b )
211 
212 /// Adds the lower single-precision, floating-point values of a and b; the
213 /// upper seven single-precision, floating-point values are passed through from a (b is masked down to its lowest element first).
214 /// r0 := a0 + b0; r1...r7 := a1...a7 (see _mm_add_ss)
215 #define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( a, _mm256_and_ps(b, _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )
216 
217 /// Multiplies the eight single-precision, floating-point values
218 /// of a and b (see _mm_mul_ps)
219 #define AKSIMD_MUL_V8F32( a, b ) _mm256_mul_ps( a, b )
220 
221 #define AKSIMD_DIV_V8F32( a, b ) _mm256_div_ps( a, b ) ///< Divides the eight single-precision, floating-point values of a by those of b (a / b)
222 
223 /// Multiplies the lower single-precision, floating-point values of
224 /// a and b; the upper seven single-precision, floating-point values
225 /// are passed through from a (they are multiplied by 1.0f).
226 /// r0 := a0 * b0; r1...r7 := a1...a7 (see _mm_mul_ss)
227 #define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps(b, _mm256_set1_ps(1.0f), 0xfe ) )
228 
229 /// Computes the minima of the eight single-precision, floating-point
230 /// values of a and b (see _mm_min_ps)
231 #define AKSIMD_MIN_V8F32( a, b ) _mm256_min_ps( a, b )
232 
233 /// Computes the maximums of the eight single-precision, floating-point
234 /// values of a and b (see _mm_max_ps)
235 #define AKSIMD_MAX_V8F32( a, b ) _mm256_max_ps( a, b )
236 
237 /// Computes the absolute value (clears the sign bit of each element)
238 #define AKSIMD_ABS_V8F32( a ) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
239 
240 /// Changes the sign (flips the sign bit of each element)
241 #define AKSIMD_NEG_V8F32( __a__ ) _mm256_xor_ps(_mm256_set1_ps(-0.f), __a__)
242 
243 /// Vector square root (see _mm_sqrt_ps)
244 #define AKSIMD_SQRT_V8F32( __a__ ) _mm256_sqrt_ps( (__a__) )
245 
246 /// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
247 #define AKSIMD_RSQRT_V8F32( __a__ ) _mm256_rsqrt_ps( (__a__) )
248 
249 /// Vector reciprocal approximation 1/a (see _mm_rcp_ps)
250 #define AKSIMD_RECIP_V8F32( __a__ ) _mm256_rcp_ps( (__a__) )
251 
252 /// Vector ceil
253 #define AKSIMD_CEIL_V8F32( __a__ ) _mm256_ceil_ps( (__a__) )
254 
255 #define AKSIMD_XOR_V8F32( a, b ) _mm256_xor_ps(a,b) ///< Bitwise XOR of a and b
256 #define AKSIMD_OR_V8F32( a, b ) _mm256_or_ps(a,b) ///< Bitwise OR of a and b
257 #define AKSIMD_AND_V8F32( a, b) _mm256_and_ps(a,b) ///< Bitwise AND of a and b
258 #define AKSIMD_NOT_V8F32( a ) _mm256_xor_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(~0))) ///< Bitwise NOT of a (XOR with all bits set)
259 
260 /// Horizontal add across the entire vector - returns a vector in which every element holds the sum of all eight elements of vVec
261 /// \akwarning
262 /// Don't expect this to be very efficient.
263 /// \endakwarning
264 static AkForceInline AKSIMD_V8F32 AKSIMD_HORIZONTALADD_V8F32(AKSIMD_V8F32 vVec)
265 {
266  // Add each element to its neighbour (0xB1 swaps adjacent elements) -> sums of pairs
267  const __m256 vPairSum = _mm256_add_ps(vVec, _mm256_shuffle_ps(vVec, vVec, 0xB1));
268  // Add the two pair sums of each 128b lane (0x4E swaps the 64b halves of a lane) -> per-lane sums
269  const __m256 vLaneSum = _mm256_add_ps(vPairSum, _mm256_shuffle_ps(vPairSum, vPairSum, 0x4E));
270  // Add the two lanes together (0x01 swaps the 128b lanes) -> full sum in all eight elements
271  const __m256 vOtherLane = _mm256_permute2f128_ps(vLaneSum, vLaneSum, 0x01);
272  return _mm256_add_ps(vLaneSum, vOtherLane);
273 }
274 
275 /// Cross-platform SIMD multiplication of 4 complex data elements (8 floats) with interleaved real and imaginary parts
276 static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(const AKSIMD_V8F32 cIn1, const AKSIMD_V8F32 cIn2)
277 {
278  // Per pair, cIn1 = (b, a) and cIn2 = (d, c); computes (a + ib)(c + id) = (ac - bd) + i(ad + bc)
279  const __m256 vReal1 = _mm256_moveldup_ps(cIn1); // reals duplicated (a3, a3, a2, a2, a1, a1, a0, a0)
280  const __m256 vImag1 = _mm256_movehdup_ps(cIn1); // imaginaries duplicated (b3, b3, b2, b2, b1, b1, b0, b0)
281  const __m256 vSwapped2 = _mm256_shuffle_ps(cIn2, cIn2, 0xB1); // real/imag swapped (c3, d3, c2, d2, c1, d1, c0, d0)
282  const __m256 vRealTerms = _mm256_mul_ps(vReal1, cIn2); // (a3d3, a3c3, a2d2, a2c2, a1d1, a1c1, a0d0, a0c0)
283  const __m256 vImagTerms = _mm256_mul_ps(vImag1, vSwapped2); // (b3c3, b3d3, b2c2, b2d2, b1c1, b1d1, b0c0, b0d0)
284  return _mm256_addsub_ps(vRealTerms, vImagTerms); // (a3d3+b3c3, a3c3-b3d3, a2d2+b2c2, a2c2-b2d2, a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0)
285 }
286 
287 //@}
288 ////////////////////////////////////////////////////////////////////////
289 
290 
291 ////////////////////////////////////////////////////////////////////////
292 /// @name AKSIMD packing / unpacking
293 //@{
294 
295 /// Selects and interleaves the lower two single-precision, floating-point
296 /// values from each 128-bit lane in a and b (see _mm_unpacklo_ps)
297 /// i.e. r0 := a0, r1 := b0, r2 := a1, r3 := b1, r4 := a4, r5 := b4, r6 := a5, r7 := b5
298 #define AKSIMD_UNPACKLO_V8F32( a, b ) _mm256_unpacklo_ps( a, b )
299 
300 /// Selects and interleaves the upper two single-precision, floating-point
301 /// values from each 128-bit lane in a and b (see _mm_unpackhi_ps)
302 /// i.e. r0 := a2, r1 := b2, r2 := a3, r3 := b3, r4 := a6, r5 := b6, r6 := a7, r7 := b7
303 #define AKSIMD_UNPACKHI_V8F32( a, b ) _mm256_unpackhi_ps( a, b )
304 
305 //@}
306 ////////////////////////////////////////////////////////////////////////
307 
308 ////////////////////////////////////////////////////////////////////////
309 /// @name AKSIMD vector comparison
310 //@{
311 
312 #define AKSIMD_CMP_CTRLMASKV8 __m256 ///< Vector type used for comparison control masks
313 
314 /// Vector "<=" operation (see _mm_cmple_ps)
315 #define AKSIMD_LTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LE_OS )
316 
317 #define AKSIMD_LT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LT_OS ) ///< Vector "<" operation (see _mm_cmplt_ps)
318 
319 /// Vector ">=" operation (see _mm_cmpge_ps)
320 #define AKSIMD_GTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GE_OS )
321 
322 #define AKSIMD_GT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GT_OS ) ///< Vector ">" operation (see _mm_cmpgt_ps)
323 
324 /// Vector "==" operation (see _mm_cmpeq_ps)
325 #define AKSIMD_EQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_EQ_OS )
326 
327 /// Per element, returns vA where the sign bit (most significant bit) of vMask is clear and vB where it is set (see _mm256_blendv_ps); only that bit of each mask element is examined. The mask is usually provided by above comparison operations
328 static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
329 {
330  return _mm256_blendv_ps(vA, vB, vMask);
331 }
332 
333 // (cond1 >= cond2) ? b : a.
334 #define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) )
335 
336 // a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax.
337 #define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( __a__, _mm256_set1_ps(0) ) )
338 
339 #define AKSIMD_SPLAT_V8F32(var, idx) AKSIMD_SHUFFLE_V8F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx)) ///< For each 128b lane, copies element idx (0-3) of that lane into all four elements of the lane
340 
341 #define AKSIMD_MASK_V8F32( __a__ ) _mm256_movemask_ps( __a__ ) ///< Collects the sign bit of each of the eight elements into an integer bit mask (see _mm_movemask_ps)
342 
343 // returns true if every bit of the provided vector is zero (bitwise test: a float element equal to -0.0f has its sign bit set and does not count as zero)
344 #define AKSIMD_TESTZERO_V8I32( __a__ ) (_mm256_testz_si256(__a__,__a__) != 0)
345 #define AKSIMD_TESTZERO_V8F32( __a__) AKSIMD_TESTZERO_V8I32(_mm256_castps_si256(__a__))
346 
347 // returns true if every bit of the provided vector is set (i.e. every 32-bit element is 0xFFFFFFFF, not the value 1)
348 #define AKSIMD_TESTONES_V8I32(__a__) (_mm256_testc_si256(__a__, _mm256_set1_epi32(~0)) != 0)
349 #define AKSIMD_TESTONES_V8F32( __a__) AKSIMD_TESTONES_V8I32(_mm256_castps_si256(__a__))
350 
351 //@}
352 ////////////////////////////////////////////////////////////////////////
353 
354 /// Loads 256-bit value; the address does not need to be 32-byte aligned (see _mm_loadu_si128)
355 /// On every modern x86 processor this performs the same as an aligned load.
356 #define AKSIMD_LOAD_V8I32( __addr__ ) _mm256_loadu_si256( (__addr__) )
357 
358 /// Sets the eight 32-bit integer values to zero (see _mm_setzero_si128)
359 #define AKSIMD_SETZERO_V8I32() _mm256_setzero_si256()
360 
361 /// Sets all eight 32-bit integer values to the provided scalar value (see _mm_set1_epi32)
362 #define AKSIMD_SET_V8I32( __scalar__ ) _mm256_set1_epi32( (__scalar__) )
363 
364 /// Populates the full vector with the 8 values provided (_h is the highest element, _a is element 0)
365 #define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )
366 
367 /// Combines the two m128i's provided into the output m256i (m1 becomes the lower 128b lane, m2 the upper)
368 /// Note that this should be utilized instead of, e.g. adding & utilizing a macro "AKSIMD_INSERT_V8I32(m, i, idx)"
369 /// Because there is no direct corresponding instruction for an insert into 256. You should load into 128s
370 /// and use that. Some compilers do not handle _mm256_insert_epi32 (etc) well, or even include them
371 #define AKSIMD_SET_V2I128(m1, m2) _mm256_setr_m128i(m1, m2)
372 
373 /// Stores eight 32-bit integer values.
374 /// The address does not need to be 32-byte aligned (see _mm_storeu_si128).
375 /// On every modern x86 processor this performs the same as an aligned store.
376 #define AKSIMD_STORE_V8I32( __addr__, __vec__ ) _mm256_storeu_si256( (__addr__), (__vec__) )
377 
378 ////////////////////////////////////////////////////////////////////////
379 /// @name AKSIMD conversion
380 //@{
381 
382 /// Converts the eight signed 32-bit integer values of a to single-precision,
383 /// floating-point values (see _mm_cvtepi32_ps)
384 #define AKSIMD_CONVERT_V8I32_TO_V8F32( __vec__ ) _mm256_cvtepi32_ps( (__vec__) )
385 
386 /// Converts the eight single-precision, floating-point values of a to signed
387 /// 32-bit integer values by rounding, using the current MXCSR rounding mode (see _mm_cvtps_epi32)
388 #define AKSIMD_ROUND_V8F32_TO_V8I32( __vec__ ) _mm256_cvtps_epi32( (__vec__) )
389 
390 /// Converts the eight single-precision, floating-point values of a to signed
391 /// 32-bit integer values by truncating (see _mm_cvttps_epi32)
392 #define AKSIMD_TRUNCATE_V8F32_TO_V8I32( __vec__ ) _mm256_cvttps_epi32( (__vec__) )
393 
394 /// Converts the eight half-precision floating-point values of vec (packed in a 128-bit integer vector) to
395 /// eight single-precision floating-point values
396 /// WARNING: Using this requires F16C support, which is not guaranteed on AVX
397 #define AKSIMD_CONVERT_V8F16_TO_V8F32( __vec__ ) _mm256_cvtph_ps( (__vec__) )
398 
399 /// Converts the eight single-precision, floating-point values of vec to
400 /// eight half-precision floating-point values, rounding to nearest (_MM_FROUND_TO_NEAREST_INT)
401 /// WARNING: Using this requires F16C support, which is not guaranteed on AVX
402 #define AKSIMD_CONVERT_V8F32_TO_V8F16( __vec__ ) _mm256_cvtps_ph(__vec__, (_MM_FROUND_TO_NEAREST_INT ) )
403 
404 //@}
405 ////////////////////////////////////////////////////////////////////////
406 
407 ////////////////////////////////////////////////////////////////////////
408 /// @name AKSIMD cast
409 //@{
410 
411 /// Reinterprets a vector of type AKSIMD_V4F64 as type AKSIMD_V8F32 (bits unchanged, no value conversion). This intrinsic is only
412 /// used for compilation and does not generate any instructions, thus it has zero latency.
413 #define AKSIMD_CAST_V4F64_TO_V8F32( __vec__ ) _mm256_castpd_ps(__vec__)
414 
415 /// Reinterprets a vector of type AKSIMD_V4F64 as type AKSIMD_V8I32 (bits unchanged, no value conversion). This intrinsic is only
416 /// used for compilation and does not generate any instructions, thus it has zero latency.
417 #define AKSIMD_CAST_V4F64_TO_V8I32( __vec__ ) _mm256_castpd_si256(__vec__)
418 
419 /// Reinterprets a vector of type AKSIMD_V8F32 as type AKSIMD_V4F64 (bits unchanged, no value conversion). This intrinsic is only
420 /// used for compilation and does not generate any instructions, thus it has zero latency.
421 #define AKSIMD_CAST_V8F32_TO_V4F64( __vec__ ) _mm256_castps_pd(__vec__)
422 
423 /// Reinterprets a vector of type AKSIMD_V8F32 as type AKSIMD_V8I32 (bits unchanged, no value conversion). This intrinsic is only
424 /// used for compilation and does not generate any instructions, thus it has zero latency.
425 #define AKSIMD_CAST_V8F32_TO_V8I32( __vec__ ) _mm256_castps_si256(__vec__)
426 
427 /// Reinterprets a vector of type AKSIMD_V8I32 as type AKSIMD_V4F64 (bits unchanged, no value conversion). This intrinsic is only
428 /// used for compilation and does not generate any instructions, thus it has zero latency.
429 #define AKSIMD_CAST_V8I32_TO_V4F64( __vec__ ) _mm256_castsi256_pd(__vec__)
430 
431 /// Reinterprets a vector of type AKSIMD_V8I32 as type AKSIMD_V8F32 (bits unchanged, no value conversion). This intrinsic is only
432 /// used for compilation and does not generate any instructions, thus it has zero latency.
433 #define AKSIMD_CAST_V8I32_TO_V8F32( __vec__ ) _mm256_castsi256_ps(__vec__)
434 
435 //@}
436 ////////////////////////////////////////////////////////////////////////
437 #endif // AKSIMD_AVX_SUPPORTED
438 #endif // _AK_SIMD_AVX_H_
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:536
float32_t AKSIMD_F32
32-bit float
Definition: AkSimd.h:71
#define AkForceInline
Definition: AkTypes.h:60

Was this page helpful?

Need Support?

Questions? Problems? Need more info? Contact us, and we can help!

Visit our Support page

Tell us about your project. We're here to help.

Register your project and we'll help you get started with no strings attached!

Get started with Wwise