Version
menu_open
link
Wwise SDK 2021.1.14
AkSimdMath.h
Go to the documentation of this file.
1 /***********************************************************************
2  The content of this file includes source code for the sound engine
3  portion of the AUDIOKINETIC Wwise Technology and constitutes "Level
4  Two Source Code" as defined in the Source Code Addendum attached
5  with this file. Any use of the Level Two Source Code shall be
6  subject to the terms and conditions outlined in the Source Code
7  Addendum and the End User License Agreement for Wwise(R).
8 
9  Version: v2021.1.14 Build: 6590
10  Copyright (c) 2006-2023 Audiokinetic Inc.
11  ***********************************************************************/
12 
13 //////////////////////////////////////////////////////////////////////
14 //
15 // AkSimdMath.h
16 //
17 // Library of static functions for math computations with SIMD in mind.
18 //
19 //////////////////////////////////////////////////////////////////////
20 #ifndef _AKSIMDMATH_H_
21 #define _AKSIMDMATH_H_
22 
25 #include <AkMath.h>
26 
27 namespace AkMath
28 {
29  //Take 4 vectors <x,y,z> and return <x,x,x,x>, <y,y,y,y> and <z,z,z,z>
30  AkForceInline void PermuteVectors3(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
31  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz)
32  {
33  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
34  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
35  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
36  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
37 
38  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
39  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
40  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
41  }
42 
43  //Take 3 vectors <x3,x2,x1,x0>, <y,y,y,y> and <z,z,z,z> and return 4 vectors <x,y,z,z>
44  AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32& xxxx, const AKSIMD_V4F32& yyyy, const AKSIMD_V4F32& zzzz,
45  AKSIMD_V4F32& out_v0, AKSIMD_V4F32& out_v1, AKSIMD_V4F32& out_v2, AKSIMD_V4F32& out_v3)
46  {
47  /*__m128 _mm_shuffle_ps(__m128 lo, __m128 hi, _MM_SHUFFLE(hi3, hi2, lo1, lo0))
48  Interleave inputs into low 2 floats and high 2 floats of output.Basically
49  out[0] = lo[lo0];
50  out[1] = lo[lo1];
51  out[2] = hi[hi2];
52  out[3] = hi[hi3];
53  */
54 
55  AKSIMD_V4F32 x0x1y0y1 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(1, 0, 1, 0));
56  AKSIMD_V4F32 z0z1z0z1 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(1, 0, 1, 0));
57 
58  out_v0 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(2, 0, 2, 0));
59  out_v1 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(3, 1, 3, 1));
60 
61  AKSIMD_V4F32 x2x3y2y3 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(3, 2, 3, 2));
62  AKSIMD_V4F32 z2z3z2z3 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(3, 2, 3, 2));
63 
64  out_v2 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(2, 0, 2, 0));
65  out_v3 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(3, 1, 3, 1));
66  }
67 
68  //Take 4 vectors <x,y,z,w> and return <x,x,x,x>, <y,y,y,y>, <z,z,z,z> and <w,w,w,w>
69  AkForceInline void PermuteVectors4(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
70  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz, AKSIMD_V4F32& out_wwww)
71  {
72  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
73  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
74  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
75  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
76 
77  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
78  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
79  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
80  out_wwww = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(3, 1, 3, 1));
81  }
82 
83  // 3-element dot product of 4 vectors.
85  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
86  {
87  return AKSIMD_ADD_V4F32(AKSIMD_ADD_V4F32(AKSIMD_MUL_V4F32(v0_x, v1_x), AKSIMD_MUL_V4F32(v0_y, v1_y)), AKSIMD_MUL_V4F32(v0_z, v1_z));
88  }
89 
90  // 3-element dot product of 1 common vector with 4 vectors
91  AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
92  {
93  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
94  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
95  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
96  return DotPoduct3_4x4(v0_x, v0_y, v0_z, v1_x, v1_y, v1_z);
97  }
98 
99  // 4-element dot product of 4 vectors.
100  AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32& v0_x, const AKSIMD_V4F32& v0_y, const AKSIMD_V4F32& v0_z, const AKSIMD_V4F32& v0_w,
101  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
102  {
103  return AKSIMD_ADD_V4F32(
105  AKSIMD_MUL_V4F32(v0_x, v1_x),
106  AKSIMD_MUL_V4F32(v0_y, v1_y)),
108  AKSIMD_MUL_V4F32(v0_z, v1_z),
109  AKSIMD_MUL_V4F32(v0_w, v1_w)));
110  }
111 
112  // 4-element dot product of 1 common vector with 4 vectors
113  AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
114  {
115  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
116  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
117  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
118  const AKSIMD_V4F32 v0_w = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
119  return DotPoduct4_4x4(v0_x, v0_y, v0_z, v0_w, v1_x, v1_y, v1_z, v1_w);
120  }
121 
122  // Trig functions approximation (based on the Fast versions found in AkMath.h)
124  {
125  const AKSIMD_V4F32 B = AKSIMD_SET_V4F32(4 / PI);
126  const AKSIMD_V4F32 C = AKSIMD_SET_V4F32(-4 / (PI * PI));
127  const AKSIMD_V4F32 P = AKSIMD_SET_V4F32(0.225f);
128 
129  //float y = B * x + C * x * fabs(x); //float y = X*(B+C*fabs(x));
130 
132  y = AKSIMD_MADD_V4F32(y, C, B);
133  y = AKSIMD_MUL_V4F32(y, x);
134 
135  // return P * (y * fabs(y) - y) + y;
136  AKSIMD_V4F32 sine = AKSIMD_ABS_V4F32(y);
137  sine = AKSIMD_MSUB_V4F32(y, sine, y);
138  sine = AKSIMD_MADD_V4F32(sine, P, y);
139  return sine;
140  }
141 
143  {
144  //Compute the offset needed for the cosinus. If you compare with FastCos, the constants have been combined.
145  const AKSIMD_V4F32 offsetNoWrap = AKSIMD_SET_V4F32(PI / 2); // cos = sin(x+pi/2)
146  const AKSIMD_V4F32 offsetWrap = AKSIMD_SET_V4F32(PI / 2 - 2 * PI); // Wrap: cos(x) = cos(x - 2 pi)
147  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
148 
149  // (cond1 >= cond2) ? a : b
150  AKSIMD_V4F32 offset = AKSIMD_SEL_GTEZ_V4F32(AKSIMD_SUB_V4F32(x, vHalfPI), offsetWrap, offsetNoWrap);
151  return AKSIMD_SIN_V4F32(AKSIMD_ADD_V4F32(x, offset));
152  }
153 
155  {
156  const AKSIMD_V4F32 vNeg = AKSIMD_SET_V4F32(-1.0f);
157  const AKSIMD_V4F32 vOne = AKSIMD_SET_V4F32(1.0f);
158  const AKSIMD_V4F32 vZero = AKSIMD_SET_V4F32(0.0f);
159  const AKSIMD_V4F32 vK = AKSIMD_SET_V4F32(0.28f);
160  const AKSIMD_V4F32 vKRepro = AKSIMD_SET_V4F32(1.f / 0.28f);
161  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
162  const AKSIMD_V4F32 vPI = AKSIMD_SET_V4F32(PI);
163  const AKSIMD_V4F32 vEpsilon = AKSIMD_SET_V4F32(1e-20f);
164 
165  //Ensure x is not zero a == 0 ? b : c.
166  x = AKSIMD_VSEL_V4F32(x, vEpsilon, AKSIMD_EQ_V4F32(x, vZero));
167 
168  AKSIMD_V4F32 z = AKSIMD_DIV_V4F32(y, x);
169  AKSIMD_V4F32 absz = AKSIMD_ABS_V4F32(z);
170  AKSIMD_V4COND zcond = AKSIMD_GTEQ_V4F32(vOne, absz);
171 
172  //The approximation is done in 2 segments of the form: offset + z/a*(z*z + b);
173 
174  //if ( fabsf( z ) < 1.0f ) then use .28 for the a coef
175  AKSIMD_V4F32 a = AKSIMD_VSEL_V4F32(vNeg, vK, zcond);
176 
177  //if ( fabsf( z ) < 1.0f ) then use 1 for the b factor, else use 0.28
178  AKSIMD_V4F32 b = AKSIMD_VSEL_V4F32(vK, vKRepro, zcond);
179 
180  AKSIMD_V4F32 atan = AKSIMD_MADD_V4F32(z, z, b);
181  atan = AKSIMD_MUL_V4F32(atan, a);
182  atan = AKSIMD_DIV_V4F32(z, atan);
183 
184  //Adjust for quadrant
185  // zcond x<0 y<0 offset
186  // 1 0 0 0
187  // 1 0 1 0
188  // 1 1 0 +PI
189  // 1 1 1 -PI
190  // 0 0 0 +PI/2
191  // 0 0 1 -PI/2
192  // 0 1 0 +PI/2
193  // 0 1 1 -PI/2
194 
195  AKSIMD_V4F32 offsetByX = AKSIMD_SEL_GTEZ_V4F32(x, vZero, vPI);
196  AKSIMD_V4F32 offset = AKSIMD_VSEL_V4F32(vHalfPI, offsetByX, zcond);
197  AKSIMD_V4F32 sign = AKSIMD_SEL_GTEZ_V4F32(y, vOne, vNeg);
198 
199  //Apply computed offset.
200  atan = AKSIMD_MADD_V4F32(offset, sign, atan);
201  return atan;
202  }
203 
204  //Accepts any positive x. Compare with FastSqrt() which accepts only between ]0,1]
206  {
208  return AKSIMD_GETELEMENT_V4F32(y, 0);
209  }
210 
211  //Compute 1/sqrt(x)
213  {
215  return AKSIMD_GETELEMENT_V4F32(y, 0);
216  }
217 
219  {
221  return AKSIMD_GETELEMENT_V4F32(y, 0);
222  }
223 }
224 
225 #endif //_AKSIMDMATH_H_
float AkReal32
32-bit floating point
Definition: AkTypes.h:70
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:536
AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:91
AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:113
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimd.h:73
AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v0_w, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:100
#define AKSIMD_VSEL_V4F32(__a__, __b__, __c__)
Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usu...
Definition: AkSimd.h:933
#define AKSIMD_GTEQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:912
Definition: AkSimdMath.h:28
#define AKSIMD_RSQRT_V4F32(__a__)
Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
Definition: AkSimd.h:724
AkForceInline AKSIMD_V4F32 AKSIMD_COS_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:142
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimd.h:75
AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32 &xxxx, const AKSIMD_V4F32 &yyyy, const AKSIMD_V4F32 &zzzz, AKSIMD_V4F32 &out_v0, AKSIMD_V4F32 &out_v1, AKSIMD_V4F32 &out_v2, AKSIMD_V4F32 &out_v3)
Definition: AkSimdMath.h:44
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:680
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:549
AkForceInline AKSIMD_V4F32 DotPoduct3_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:84
#define AKSIMD_EQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:927
AkForceInline AkReal32 FastRSqrt(AkReal32 x)
Definition: AkSimdMath.h:212
#define AKSIMD_SUB_V4F32(__a__, __b__)
Definition: AkSimd.h:598
AkForceInline AKSIMD_V4F32 AKSIMD_ATAN2_V4F32(AKSIMD_V4F32 y, AKSIMD_V4F32 x)
Definition: AkSimdMath.h:154
#define AKSIMD_MSUB_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:688
#define AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:939
#define AKSIMD_SET_V4F32(__scalar__)
Definition: AkSimd.h:109
AkForceInline void PermuteVectors4(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz, AKSIMD_V4F32 &out_wwww)
Definition: AkSimdMath.h:69
AkForceInline void PermuteVectors3(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz)
Definition: AkSimdMath.h:30
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:650
AkForceInline AkReal32 FastSqrtLarge(AkReal32 x)
Definition: AkSimdMath.h:205
#define AKSIMD_RECIP_V4F32(__a__)
Reciprocal of x (1/x)
Definition: AkSimd.h:730
#define AKSIMD_ABS_V4F32(__a__)
Returns absolute value.
Definition: AkSimd.h:714
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:612
#define AKSIMD_GETELEMENT_V4F32(__vName, __num__)
Retrieve scalar element from vector.
Definition: AkSimd.h:38
AkForceInline AKSIMD_V4F32 AKSIMD_SIN_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:123
AkForceInline AkReal32 FastRcp(AkReal32 x)
Definition: AkSimdMath.h:218
#define AkForceInline
Definition: AkTypes.h:60
#define AKSIMD_SQRT_V4F32(__vec__)
Square root (4 floats)
Definition: AkSimd.h:721
#define AKSIMD_MUL_V4F32(__a__, __b__)
Definition: AkSimd.h:643

Was this page helpful?

Need Support?

Questions? Problems? Need more info? Contact us, and we can help!

Visit our Support page

Tell us about your project. We're here to help.

Register your project and we'll help you get started with no strings attached!

Get started with Wwise