バージョン
menu

Wwise SDK 2025.1.2
AkSimdAvx.h
[詳解]
1 /*******************************************************************************
2 The content of this file includes portions of the AUDIOKINETIC Wwise Technology
3 released in source code form as part of the SDK installer package.
4 
5 Commercial License Usage
6 
7 Licensees holding valid commercial licenses to the AUDIOKINETIC Wwise Technology
8 may use this file in accordance with the end user license agreement provided
9 with the software or, alternatively, in accordance with the terms contained in a
10 written agreement between you and Audiokinetic Inc.
11 
12 Apache License Usage
13 
14 Alternatively, this file may be used under the Apache License, Version 2.0 (the
15 "Apache License"); you may not use this file except in compliance with the
16 Apache License. You may obtain a copy of the Apache License at
17 http://www.apache.org/licenses/LICENSE-2.0.
18 
19 Unless required by applicable law or agreed to in writing, software distributed
20 under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
21 OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for
22 the specific language governing permissions and limitations under the License.
23 
24  Copyright (c) 2025 Audiokinetic Inc.
25 *******************************************************************************/
26 
27 // AkSimdAvx.h
28 
29 /// \file
30 /// AKSIMD - AVX implementation
31 
32 #ifndef _AK_SIMD_AVX_H_
33 #define _AK_SIMD_AVX_H_
34 
37 
38 #if defined(AKSIMD_AVX_SUPPORTED)
39 
40 #include <immintrin.h>
41 
42 ////////////////////////////////////////////////////////////////////////
43 /// @name AKSIMD types
44 //@{
45 
typedef float AKSIMD_F32; ///< 32-bit float
typedef __m256 AKSIMD_V8F32; ///< Vector of 8 32-bit floats
typedef __m256d AKSIMD_V4F64; ///< Vector of 4 64-bit floats
typedef __m256i AKSIMD_V8I32; ///< Vector of 8 32-bit signed integers
typedef AKSIMD_V8F32 AKSIMD_V8COND; ///< Vector of 8 comparison results (float lanes, all-ones = true)
typedef AKSIMD_V8F32 AKSIMD_V8FCOND; ///< Vector of 8 comparison results (float lanes, all-ones = true)
typedef AKSIMD_V8I32 AKSIMD_V8ICOND; ///< Vector of 8 comparison results (integer lanes, all-ones = true)
53 
54 //@}
55 ////////////////////////////////////////////////////////////////////////
56 
57 
58 ////////////////////////////////////////////////////////////////////////
59 /// @name AKSIMD loading / setting
60 //@{
61 
/// Loads eight single-precision floating-point values from memory.
/// The address does not need to be 32-byte aligned (see _mm256_loadu_ps).
/// On every modern x86 processor this performs the same as an aligned load.
#define AKSIMD_LOAD_V8F32( __addr__ ) _mm256_loadu_ps( (AkReal32*)(__addr__) )

/// Loads a single single-precision, floating-point value, copying it into
/// all eight lanes (see _mm256_broadcast_ss)
#define AKSIMD_LOAD1_V8F32( __scalar__ ) _mm256_broadcast_ss( &(__scalar__) )

/// Loads a single double-precision, floating-point value, and copies it into
/// all four 64-bit elements of the vector (see _mm256_broadcast_sd);
/// the result is reinterpreted as a float vector.
#define AKSIMD_LOAD1_V4F64( __scalar__ ) _mm256_castpd_ps(_mm256_broadcast_sd( &(__scalar__) ))

/// Sets the eight single-precision, floating-point values to in_value (see
/// _mm256_set1_ps)
#define AKSIMD_SET_V8F32( __scalar__ ) _mm256_set1_ps( (__scalar__) )

/// Populates the full vector with the 8 floating point values provided;
/// _a lands in lane 0, _h in lane 7 (see _mm256_set_ps)
#define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

/// Populates the full vector with the 4 double-prec floating point values provided;
/// the result is reinterpreted as a float vector (see _mm256_set_pd)
#define AKSIMD_SETV_V4F64( _d, _c, _b, _a ) _mm256_castpd_ps( _mm256_set_pd( (_d), (_c), (_b), (_a) ) )

/// Sets the eight single-precision, floating-point values to zero (see
/// _mm256_setzero_ps)
#define AKSIMD_SETZERO_V8F32() _mm256_setzero_ps()

/// Loads a single-precision, floating-point value into the low word
/// and clears the upper seven words.
/// r0 := *p; r1...r7 := 0.0 (see _mm_load_ss, _mm256_zextps128_ps256)
#define AKSIMD_LOAD_SS_V8F32( __addr__ ) _mm256_zextps128_ps256(_mm_load_ss( (__addr__) ))

/// Performs a masked load of the target address, where if the high-bit is set on the mask (should be AKSIMD_V4COND)
/// then value will be loaded from the address. Otherwise, value will be set to zero (see _mm_maskload_ps)
#define AKSIMD_MASKLOAD_V4F32( __addr__, __mask__ ) _mm_maskload_ps( __addr__, _mm_castps_si128(__mask__) )

/// Performs a masked load of the target address, where if the high-bit is set on the mask (should be AKSIMD_V8COND)
/// then value will be loaded from the address. Otherwise, value will be set to zero (see _mm256_maskload_ps)
#define AKSIMD_MASKLOAD_V8F32( __addr__, __mask__ ) _mm256_maskload_ps( __addr__, _mm256_castps_si256(__mask__) )


/// Combines the two __m128 halves provided into the output __m256, with m1 as
/// the low 128-bit lane and m2 as the high 128-bit lane (see _mm256_set_m128).
/// Note that this should be utilized instead of, e.g. adding & utilizing a macro "AKSIMD_INSERT_V8I32(m, i, idx)"
/// Because there is no direct corresponding instruction for an insert into 256. You should load into 128s
/// and use that. Some compilers do not handle _mm256_insert_epi32 (etc) well, or even include them
#define AKSIMD_SETV_V2F128( m2, m1) _mm256_set_m128(m2, m1)

/// Inserts the 128-bit value m128 into lane idx (0 = low, 1 = high) of a (see _mm256_insertf128_ps)
#define AKSIMD_INSERT_V2F128( a, m128, idx) _mm256_insertf128_ps(a, m128, idx)

/// Element accessors via pointer reinterpretation of the vector's storage.
/// These yield lvalues, so they can be written to as well as read.
#define AKSIMD_GETELEMENT_V8F32( __vName, __num__ ) ((AkReal32*)&(__vName))[(__num__)]
#define AKSIMD_GETELEMENT_V4F64( __vName, __num__ ) ((AkReal64*)&(__vName))[(__num__)]
#define AKSIMD_GETELEMENT_V8I32( __vName, __num__ ) ((AkInt32*)&(__vName))[(__num__)]
#define AKSIMD_GETELEMENT_V4I64( __vName, __num__ ) ((AkInt64*)&(__vName))[(__num__)]
115 
116 //@}
117 ////////////////////////////////////////////////////////////////////////
118 
119 
120 ////////////////////////////////////////////////////////////////////////
121 /// @name AKSIMD storing
122 //@{
123 
/// Stores eight single-precision, floating-point values.
/// The address does not need to be 32-byte aligned (see _mm256_storeu_ps).
/// On every modern x86 processor this performs the same as an aligned store.
#define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lower single-precision, floating-point value.
/// *p := a0 (see _mm_store_ss)
#define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) )

/// Performs a masked store to the target address, where if the high-bit is set on the mask (should be AKSIMD_V4COND)
/// then value will be stored to the address. Otherwise, destination memory is not modified (see _mm_maskstore_ps)
#define AKSIMD_MASKSTORE_V4F32( __addr__, __mask__, __vec__ ) _mm_maskstore_ps( __addr__, _mm_castps_si128(__mask__), __vec__ )

/// Performs a masked store to the target address, where if the high-bit is set on the mask (should be AKSIMD_V8COND)
/// then value will be stored to the address. Otherwise, destination memory is not modified (see _mm256_maskstore_ps)
#define AKSIMD_MASKSTORE_V8F32( __addr__, __mask__, __vec__ ) _mm256_maskstore_ps( __addr__, _mm256_castps_si256(__mask__), __vec__ )
140 
141 //@}
142 ////////////////////////////////////////////////////////////////////////
143 
144 ////////////////////////////////////////////////////////////////////////
145 /// @name AKSIMD shuffling
146 //@{
147 
/// Selects eight specific single-precision, floating-point values from
/// a and b, based on the mask i within 128-bit lanes (see _mm256_shuffle_ps)
/// This means that the AKSIMD_SHUFFLE operand still picks 1 of 4 32b components
/// inside of each of the 2 128b lanes.
// Usage: AKSIMD_SHUFFLE_V8F32( vec1, vec2, AKSIMD_SHUFFLE( z, y, x, w ) )
#define AKSIMD_SHUFFLE_V8F32( a, b, i ) _mm256_shuffle_ps( a, b, i )

/// For each 128b lane, Swap the 2 lower floats together and the 2 higher floats together. ( h g f e d c b a -> g h e f c d a b )
#define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))

/// For each 128b lane, Swap the 2 lower floats with the 2 higher floats. ( h g f e d c b a -> f e h g b a d c )
#define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))

/// For each 128b lane, barrel-shift all floats by one. ( h g f e d c b a -> e h g f a d c b )
#define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))

/// For each 128b lane, duplicates the odd items into the even items ( h g f e d c b a -> h h f f d d b b )
#define AKSIMD_DUP_V8_ODD(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// For each 128b lane, duplicates the even items into the odd items ( h g f e d c b a -> g g e e c c a a )
#define AKSIMD_DUP_V8_EVEN(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))

/// Shuffle 32-bit integers from a and b within 128-bit lanes using the control in i,
/// and return the results (float shuffle with casts; there is no AVX1 integer shuffle)
#define AKSIMD_SHUFFLE_V8I32( a, b, i ) _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), i ))

/// Shuffles single-precision (32-bit) floating-point elements in a within 128-bit lanes
/// using the per-element control in b, and returns the results (see _mm256_permutevar_ps)
#define AKSIMD_PERMUTEVAR_V8F32(a, b) _mm256_permutevar_ps(a, b)

// Macro for selection parameter for AKSIMD_PERMUTE_2X128_V8F32()
#define AKSIMD_PERMUTE128( l1, l0 ) (((l1) << 4) | (l0))

/// For each 128b lane, select one of the four input 128b lanes across a and b,
/// based on the mask i. AKSIMD_SHUFFLE can still be directly used as a control
/// (see _mm256_permute2f128_ps)
#define AKSIMD_PERMUTE_2X128_V8F32( a, b, i ) _mm256_permute2f128_ps(a, b, i)

/// Selects the lower of each of the 128b lanes in a and b to be the result ( B A ), ( D C ) -> ( C A )
#define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(2, 0))

/// Selects the higher of each of the 128b lanes in a and b to be the result ( B A ), ( D C ) -> ( D B )
#define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(3, 1))

/// Gets the specified 128b lane (0 = low, 1 = high) from a and stores it in the result (see _mm256_extractf128_ps)
#define AKSIMD_EXTRACT_V2F128( a, i ) _mm256_extractf128_ps(a, i)
191 
/// Rotate the 4x4 vectors in each of the 128b lanes. After rotation:
/// A[7:0] = D[4] C[4] B[4] A[4] D[0] C[0] B[0] A[0]
/// B[7:0] = D[5] C[5] B[5] A[5] D[1] C[1] B[1] A[1]
/// C[7:0] = D[6] C[6] B[6] A[6] D[2] C[2] B[2] A[2]
/// D[7:0] = D[7] C[7] B[7] A[7] D[3] C[3] B[3] A[3]
/// In-place 4x4 transpose performed independently in each 128-bit lane;
/// all four operands are updated.
static AkForceInline void AKSIMD_TRANSPOSE8X4_V8F32(AKSIMD_V8F32& A, AKSIMD_V8F32& B, AKSIMD_V8F32& C, AKSIMD_V8F32& D)
{
	// Pass 1: gather low/high element pairs of (A,B) and (C,D).
	AKSIMD_V8F32 tmp1, tmp2, tmp3, tmp4;
	tmp1 = AKSIMD_SHUFFLE_V8F32(A, B, AKSIMD_SHUFFLE(1,0,1,0)); // per lane: B1 B0 A1 A0
	tmp2 = AKSIMD_SHUFFLE_V8F32(A, B, AKSIMD_SHUFFLE(3,2,3,2)); // per lane: B3 B2 A3 A2
	tmp3 = AKSIMD_SHUFFLE_V8F32(C, D, AKSIMD_SHUFFLE(1,0,1,0)); // per lane: D1 D0 C1 C0
	tmp4 = AKSIMD_SHUFFLE_V8F32(C, D, AKSIMD_SHUFFLE(3,2,3,2)); // per lane: D3 D2 C3 C2

	// Pass 2: select even/odd elements of the pairs to finish the transpose.
	A = AKSIMD_SHUFFLE_V8F32(tmp1, tmp3, AKSIMD_SHUFFLE(2, 0, 2, 0)); // per lane: D0 C0 B0 A0
	B = AKSIMD_SHUFFLE_V8F32(tmp1, tmp3, AKSIMD_SHUFFLE(3, 1, 3, 1)); // per lane: D1 C1 B1 A1
	C = AKSIMD_SHUFFLE_V8F32(tmp2, tmp4, AKSIMD_SHUFFLE(2, 0, 2, 0)); // per lane: D2 C2 B2 A2
	D = AKSIMD_SHUFFLE_V8F32(tmp2, tmp4, AKSIMD_SHUFFLE(3, 1, 3, 1)); // per lane: D3 C3 B3 A3
}
210 
211 //@}
212 ////////////////////////////////////////////////////////////////////////
213 
214 
215 ////////////////////////////////////////////////////////////////////////
216 /// @name AKSIMD arithmetic
217 //@{
218 
/// Subtracts the eight single-precision, floating-point values of
/// a and b (a - b) (see _mm256_sub_ps)
#define AKSIMD_SUB_V8F32( a, b ) _mm256_sub_ps( a, b )
222 
/// Subtracts the lower single-precision, floating-point values of a and b.
/// The upper seven single-precision, floating-point values are passed through from a.
/// r0 := a0 - b0 ; r1...r7 := a1...a7 (see _mm_sub_ss)
/// Implementation: mask b down to its lane 0 (all-ones mask in lane 0, zeroes
/// elsewhere), then subtract; a - 0 passes lanes 1-7 through unchanged.
/// The integer mask must be cast with _mm256_castsi256_ps: _mm256_and_ps takes
/// __m256 operands, and __m256i does not implicitly convert to __m256.
#define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( a, _mm256_and_ps( (b), _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )
227 
/// Adds the eight single-precision, floating-point values of
/// a and b (see _mm256_add_ps)
#define AKSIMD_ADD_V8F32( a, b ) _mm256_add_ps( a, b )

/// Performs alternating subs and adds of the eight single-precision,
/// floating-point values of a and b: even lanes are a-b, odd lanes are a+b
/// (see _mm256_addsub_ps)
#define AKSIMD_ADDSUB_V8F32( a, b ) _mm256_addsub_ps( a, b )
235 
/// Adds the lower single-precision, floating-point values of a and b; the
/// upper seven single-precision, floating-point values are passed through from a.
/// r0 := a0 + b0; r1...r7 := a1...a7 (see _mm_add_ss)
/// Implementation: mask b down to its lane 0 (all-ones mask in lane 0, zeroes
/// elsewhere), then add; a + 0 passes lanes 1-7 through unchanged.
/// The integer mask must be cast with _mm256_castsi256_ps: _mm256_and_ps takes
/// __m256 operands, and __m256i does not implicitly convert to __m256.
#define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( a, _mm256_and_ps( (b), _mm256_castsi256_ps( _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )
240 
/// Multiplies the eight single-precision, floating-point values
/// of a and b (see _mm256_mul_ps)
#define AKSIMD_MUL_V8F32( a, b ) _mm256_mul_ps( a, b )

/// Divides the eight single-precision, floating-point values of a
/// by those of b (see _mm256_div_ps)
#define AKSIMD_DIV_V8F32( a, b ) _mm256_div_ps( a, b )

/// Multiplies the lower single-precision, floating-point values of
/// a and b; the upper seven single-precision, floating-point values
/// are passed through from a.
/// r0 := a0 * b0; r1...r7 := a1...a7 (see _mm_mul_ss)
/// (implemented by blending 1.0f into lanes 1-7 of b before multiplying)
#define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps(b, _mm256_set1_ps(1.0f), 0xfe ) )

/// Computes the minima of the eight single-precision, floating-point
/// values of a and b (see _mm256_min_ps)
#define AKSIMD_MIN_V8F32( a, b ) _mm256_min_ps( a, b )

/// Computes the maximums of the eight single-precision, floating-point
/// values of a and b (see _mm256_max_ps)
#define AKSIMD_MAX_V8F32( a, b ) _mm256_max_ps( a, b )

/// Computes the absolute value (clears the sign bit of every lane)
#define AKSIMD_ABS_V8F32( a ) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)

/// Changes the sign (flips the sign bit of every lane)
#define AKSIMD_NEG_V8F32( __a__ ) _mm256_xor_ps(_mm256_set1_ps(-0.f), __a__)

/// Vector square root (see _mm256_sqrt_ps)
#define AKSIMD_SQRT_V8F32( __a__ ) _mm256_sqrt_ps( (__a__) )

/// Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
/// (see _mm256_rsqrt_ps)
#define AKSIMD_RSQRT_V8F32( __a__ ) _mm256_rsqrt_ps( (__a__) )

/// Vector reciprocal approximation (see _mm256_rcp_ps)
#define AKSIMD_RECIP_V8F32( __a__ ) _mm256_rcp_ps( (__a__) )

/// Vector ceil (see _mm256_ceil_ps)
#define AKSIMD_CEIL_V8F32( __a__ ) _mm256_ceil_ps( (__a__) )

/// Bitwise operations on the raw 256-bit contents of float vectors
#define AKSIMD_XOR_V8F32( a, b ) _mm256_xor_ps(a,b)
#define AKSIMD_OR_V8F32( a, b ) _mm256_or_ps(a,b)
#define AKSIMD_AND_V8F32( a, b) _mm256_and_ps(a,b)
#define AKSIMD_NOT_V8F32( a ) _mm256_xor_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(~0)))
#define AKSIMD_ANDNOT_V8F32( a, b ) _mm256_andnot_ps(a, b)
284 
285 /// horizontal add across the entire vector - vVec will be updated to contain the sum of every input element of vVec
286 /// \akwarning
287 /// Don't expect this to be very efficient.
288 /// \endakwarning
289 static AkForceInline AKSIMD_V8F32 AKSIMD_HORIZONTALADD_V8F32(AKSIMD_V8F32 vVec)
290 {
291  __m256 vAb = _mm256_shuffle_ps(vVec, vVec, 0xB1);
292  __m256 vHaddAb = _mm256_add_ps(vVec, vAb);
293  __m256 vHaddCd = _mm256_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
294  __m256 vHaddAbcd = _mm256_add_ps(vHaddAb, vHaddCd);
295  __m256 vHaddEfgh = _mm256_permute2f128_ps(vHaddAbcd, vHaddAbcd, 0x01);
296  __m256 vHaddAll = _mm256_add_ps(vHaddAbcd, vHaddEfgh);
297  return vHaddAll;
298 }
299 
300 /// Cross-platform SIMD multiplication of 8 complex data elements with interleaved real and imaginary parts
301 static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(const AKSIMD_V8F32 cIn1, const AKSIMD_V8F32 cIn2)
302 {
303  __m256 real1Ext = _mm256_moveldup_ps(cIn1); // reals extended (a3, a3, a2, a2, a1, a1, a0, a0)
304  __m256 in2Shuf = _mm256_shuffle_ps(cIn2, cIn2, 0xB1); // shuf multiplicand (c3, d3, c2, d2, c1, d1, c0, d0)
305  __m256 imag1Ext = _mm256_movehdup_ps(cIn1); // multiplier imag (b3, b3, b2, b2, b1, b1, b0, b0)
306  __m256 temp = _mm256_mul_ps(imag1Ext, in2Shuf); // temp (b3c3, b3d3, b2c2, b2d2, b1c1, b1d1, b0c0, b0d0)
307  __m256 mul = _mm256_mul_ps(real1Ext, cIn2); // (a3d3, a3c3, a2d2, a2c2, a1d1, a1c1, a0d0, a0c0)
308  __m256 out = _mm256_addsub_ps(mul, temp); // final (a3d3+b3c3, a3c3-b3d3, a2d2+b2c2, a2c2-b2d2, a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0)
309  return out;
310 }
311 
312 //@}
313 ////////////////////////////////////////////////////////////////////////
314 
315 
316 ////////////////////////////////////////////////////////////////////////
317 /// @name AKSIMD packing / unpacking
318 //@{
319 
/// Selects and interleaves the lower two single-precision, floating-point
/// values from each 128-bit lane in a and b (see _mm256_unpacklo_ps)
/// i.e. r0 := a0, r1 := b0, r2 := a1, r3 := b1, r4 := a4, r5 := b4, r6 := a5, r7 := b5
#define AKSIMD_UNPACKLO_V8F32( a, b ) _mm256_unpacklo_ps( a, b )

/// Selects and interleaves the upper two single-precision, floating-point
/// values from each 128-bit lane in a and b (see _mm256_unpackhi_ps)
/// i.e. r0 := a2, r1 := b2, r2 := a3, r3 := b3, r4 := a6, r5 := b6, r6 := a7, r7 := b7
#define AKSIMD_UNPACKHI_V8F32( a, b ) _mm256_unpackhi_ps( a, b )
329 
330 //@}
331 ////////////////////////////////////////////////////////////////////////
332 
333 ////////////////////////////////////////////////////////////////////////
334 /// @name AKSIMD vector comparison
335 //@{
336 
/// Type of the control mask produced by the comparison operations below
#define AKSIMD_CMP_CTRLMASKV8 __m256

/// Vector "<=" operation (see _mm256_cmp_ps with _CMP_LE_OS)
#define AKSIMD_LTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LT_OS )

/// Vector "<" operation (see _mm256_cmp_ps with _CMP_LT_OS)
#define AKSIMD_LT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LT_OS )

/// Vector ">=" operation (see _mm256_cmp_ps with _CMP_GE_OS)
#define AKSIMD_GTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GE_OS )

/// Vector ">" operation (see _mm256_cmp_ps with _CMP_GT_OS)
#define AKSIMD_GT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GT_OS )

/// Vector "==" operation (see _mm256_cmp_ps with _CMP_EQ_OS)
#define AKSIMD_EQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_EQ_OS )
351 
352 /// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
353 static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
354 {
355  return _mm256_blendv_ps(vA, vB, vMask);
356 }
357 
// (cond1 >= cond2) ? b : a.
#define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) )

// a >= 0 ? b : c ... Written, like, you know, the normal C++ operator syntax.
#define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( __a__, _mm256_set1_ps(0) ) )

/// Broadcasts element idx (0-3) of each 128-bit lane of var into every slot of that lane
#define AKSIMD_SPLAT_V8F32(var, idx) AKSIMD_SHUFFLE_V8F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))

/// Collects the sign bit of each of the 8 float lanes into an 8-bit integer mask (see _mm256_movemask_ps)
#define AKSIMD_MASK_V8F32( __a__ ) _mm256_movemask_ps( __a__ )

// returns true if every bit of the provided vector is zero (see _mm256_testz_si256)
#define AKSIMD_TESTZERO_V8I32( __a__ ) (_mm256_testz_si256(__a__,__a__) != 0)
#define AKSIMD_TESTZERO_V8F32( __a__) AKSIMD_TESTZERO_V8I32(_mm256_castps_si256(__a__))

// returns true if every bit of the provided vector is one (see _mm256_testc_si256)
#define AKSIMD_TESTONES_V8I32(__a__) (_mm256_testc_si256(__a__, _mm256_set1_epi32(~0)) != 0)
#define AKSIMD_TESTONES_V8F32( __a__) AKSIMD_TESTONES_V8I32(_mm256_castps_si256(__a__))
375 
376 //@}
377 ////////////////////////////////////////////////////////////////////////
378 
/// Loads a 256-bit integer value; the address does not need to be
/// 32-byte aligned (see _mm256_loadu_si256).
/// On every modern x86 processor this performs the same as an aligned load.
#define AKSIMD_LOAD_V8I32( __addr__ ) _mm256_loadu_si256( (__addr__) )

/// Sets the eight 32-bit integer values to zero (see _mm256_setzero_si256)
#define AKSIMD_SETZERO_V8I32() _mm256_setzero_si256()

/// Broadcasts the provided scalar value into all eight 32-bit lanes of the vector
/// (see _mm256_set1_epi32)
#define AKSIMD_SET_V8I32( __scalar__ ) _mm256_set1_epi32( (__scalar__) )

/// Populates the full vector with the 8 values provided; _a lands in lane 0,
/// _h in lane 7 (see _mm256_set_epi32)
#define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

/// Loads the two m128i's provided into the output m256i, with m1 as the low
/// 128-bit lane and m2 as the high 128-bit lane (see _mm256_setr_m128i).
/// Note that this should be utilized instead of, e.g. adding & utilizing a macro "AKSIMD_INSERT_V8I32(m, i, idx)"
/// Because there is no direct corresponding instruction for an insert into 256. You should load into 128s
/// and use that. Some compilers do not handle _mm256_insert_epi32 (etc) well, or even include them
#define AKSIMD_SET_V2I128(m1, m2) _mm256_setr_m128i(m1, m2)

/// Stores eight 32-bit integer values.
/// The address does not need to be 32-byte aligned (see _mm256_storeu_si256).
/// On every modern x86 processor this performs the same as an aligned store.
#define AKSIMD_STORE_V8I32( __addr__, __vec__ ) _mm256_storeu_si256( (__addr__), (__vec__) )
402 
403 ////////////////////////////////////////////////////////////////////////
404 /// @name AKSIMD conversion
405 //@{
406 
/// Converts the eight signed 32-bit integer values of a to single-precision,
/// floating-point values (see _mm256_cvtepi32_ps)
#define AKSIMD_CONVERT_V8I32_TO_V8F32( __vec__ ) _mm256_cvtepi32_ps( (__vec__) )

/// Converts the eight single-precision, floating-point values of a to signed
/// 32-bit integer values by rounding (see _mm256_cvtps_epi32)
#define AKSIMD_ROUND_V8F32_TO_V8I32( __vec__ ) _mm256_cvtps_epi32( (__vec__) )

/// Converts the eight single-precision, floating-point values of a to signed
/// 32-bit integer values by truncating (see _mm256_cvttps_epi32)
#define AKSIMD_TRUNCATE_V8F32_TO_V8I32( __vec__ ) _mm256_cvttps_epi32( (__vec__) )

/// Converts the eight half-precision floating-point values of vec to
/// eight full-precision floating-point values (see _mm256_cvtph_ps)
/// WARNING: Using this requires F16C support, which is not guaranteed on AVX
#define AKSIMD_CONVERT_V8F16_TO_V8F32( __vec__ ) _mm256_cvtph_ps( (__vec__) )

/// Converts the eight single-precision, floating-point values of vec to
/// eight half-precision floating-point values, rounding to nearest (see _mm256_cvtps_ph)
/// WARNING: Using this requires F16C support, which is not guaranteed on AVX
#define AKSIMD_CONVERT_V8F32_TO_V8F16( __vec__ ) _mm256_cvtps_ph(__vec__, (_MM_FROUND_TO_NEAREST_INT ) )
428 
429 //@}
430 ////////////////////////////////////////////////////////////////////////
431 
432 ////////////////////////////////////////////////////////////////////////
433 /// @name AKSIMD cast
434 //@{
435 
/// Cast vector of type AKSIMD_V4F64 to type AKSIMD_V8F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F64_TO_V8F32( __vec__ ) _mm256_castpd_ps(__vec__)

/// Cast vector of type AKSIMD_V4F64 to type AKSIMD_V8I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V4F64_TO_V8I32( __vec__ ) _mm256_castpd_si256(__vec__)

/// Cast vector of type AKSIMD_V8F32 to type AKSIMD_V4F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V8F32_TO_V4F64( __vec__ ) _mm256_castps_pd(__vec__)

/// Cast vector of type AKSIMD_V8F32 to type AKSIMD_V8I32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V8F32_TO_V8I32( __vec__ ) _mm256_castps_si256(__vec__)

/// Cast vector of type AKSIMD_V8I32 to type AKSIMD_V4F64. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V8I32_TO_V4F64( __vec__ ) _mm256_castsi256_pd(__vec__)

/// Cast vector of type AKSIMD_V8I32 to type AKSIMD_V8F32. This intrinsic is only
/// used for compilation and does not generate any instructions, thus it has zero latency.
#define AKSIMD_CAST_V8I32_TO_V8F32( __vec__ ) _mm256_castsi256_ps(__vec__)

/// Cast vector of type AKSIMD_V8COND to AKSIMD_V8F32.
/// (no-op: AKSIMD_V8COND is a typedef of AKSIMD_V8F32)
#define AKSIMD_CAST_V8COND_TO_V8F32( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V8F32 to AKSIMD_V8COND.
/// (no-op: AKSIMD_V8COND is a typedef of AKSIMD_V8F32)
#define AKSIMD_CAST_V8F32_TO_V8COND( __vec__ ) (__vec__)

/// Cast vector of type AKSIMD_V8COND to AKSIMD_V8I32.
#define AKSIMD_CAST_V8COND_TO_V8I32( __vec__ ) _mm256_castps_si256(__vec__)

/// Cast vector of type AKSIMD_V8I32 to AKSIMD_V8COND.
#define AKSIMD_CAST_V8I32_TO_V8COND( __vec__ ) _mm256_castsi256_ps(__vec__)
471 
472 //@}
473 ////////////////////////////////////////////////////////////////////////
474 #endif //_AK_SIMD_AVX_H_
475 #endif
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:513
#define AkForceInline
Definition: AkTypes.h:63
float32_t AKSIMD_F32
32-bit float
Definition: AkSimdTypes.h:60

このページはお役に立ちましたか?

サポートは必要ですか?

ご質問や問題、ご不明点はございますか?お気軽にお問い合わせください。

サポートページをご確認ください

あなたのプロジェクトについて教えてください。ご不明な点はありませんか。

プロジェクトを登録していただくことで、ご利用開始のサポートをいたします。

Wwiseからはじめよう