Version
menu_open
link
Wwise SDK 2022.1.12
AkSimdMath.h
Go to the documentation of this file.
1 /***********************************************************************
2  The content of this file includes source code for the sound engine
3  portion of the AUDIOKINETIC Wwise Technology and constitutes "Level
4  Two Source Code" as defined in the Source Code Addendum attached
5  with this file. Any use of the Level Two Source Code shall be
6  subject to the terms and conditions outlined in the Source Code
7  Addendum and the End User License Agreement for Wwise(R).
8 
9  Copyright (c) 2024 Audiokinetic Inc.
10  ***********************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////
13 //
14 // AkSimdMath.h
15 //
16 // Library of static functions for math computations with SIMD in mind.
17 //
18 //////////////////////////////////////////////////////////////////////
19 #ifndef _AKSIMDMATH_H_
20 #define _AKSIMDMATH_H_
21 
24 #include <AkMath.h>
25 
26 namespace AkMath
27 {
28  //Take 4 vectors <x,y,z> and return <x,x,x,x>, <y,y,y,y> and <z,z,z,z>
29  AkForceInline void PermuteVectors3(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
30  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz)
31  {
32  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
33  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
34  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
35  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
36 
37  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
38  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
39  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
40  }
41 
42  //Take 3 vectors <x3,x2,x1,x0>, <y,y,y,y> and <z,z,z,z> and return 4 vectors <x,y,z,z>
43  AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32& xxxx, const AKSIMD_V4F32& yyyy, const AKSIMD_V4F32& zzzz,
44  AKSIMD_V4F32& out_v0, AKSIMD_V4F32& out_v1, AKSIMD_V4F32& out_v2, AKSIMD_V4F32& out_v3)
45  {
46  /*__m128 _mm_shuffle_ps(__m128 lo, __m128 hi, _MM_SHUFFLE(hi3, hi2, lo1, lo0))
47  Interleave inputs into low 2 floats and high 2 floats of output.Basically
48  out[0] = lo[lo0];
49  out[1] = lo[lo1];
50  out[2] = hi[hi2];
51  out[3] = hi[hi3];
52  */
53 
54  AKSIMD_V4F32 x0x1y0y1 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(1, 0, 1, 0));
55  AKSIMD_V4F32 z0z1z0z1 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(1, 0, 1, 0));
56 
57  out_v0 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(2, 0, 2, 0));
58  out_v1 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(3, 1, 3, 1));
59 
60  AKSIMD_V4F32 x2x3y2y3 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(3, 2, 3, 2));
61  AKSIMD_V4F32 z2z3z2z3 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(3, 2, 3, 2));
62 
63  out_v2 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(2, 0, 2, 0));
64  out_v3 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(3, 1, 3, 1));
65  }
66 
67  //Take 4 vectors <x,y,z,w> and return <x,x,x,x>, <y,y,y,y>, <z,z,z,z> and <w,w,w,w>
68  AkForceInline void PermuteVectors4(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
69  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz, AKSIMD_V4F32& out_wwww)
70  {
71  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
72  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
73  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
74  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
75 
76  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
77  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
78  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
79  out_wwww = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(3, 1, 3, 1));
80  }
81 
82  // 3-element dot product of 4 vectors.
84  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
85  {
86  return AKSIMD_ADD_V4F32(AKSIMD_ADD_V4F32(AKSIMD_MUL_V4F32(v0_x, v1_x), AKSIMD_MUL_V4F32(v0_y, v1_y)), AKSIMD_MUL_V4F32(v0_z, v1_z));
87  }
88 
89  // 3-element dot product of 1 common vector with 4 vectors
90  AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
91  {
92  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
93  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
94  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
95  return DotPoduct3_4x4(v0_x, v0_y, v0_z, v1_x, v1_y, v1_z);
96  }
97 
98  // 4-element dot product of 4 vectors.
99  AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32& v0_x, const AKSIMD_V4F32& v0_y, const AKSIMD_V4F32& v0_z, const AKSIMD_V4F32& v0_w,
100  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
101  {
102  return AKSIMD_ADD_V4F32(
104  AKSIMD_MUL_V4F32(v0_x, v1_x),
105  AKSIMD_MUL_V4F32(v0_y, v1_y)),
107  AKSIMD_MUL_V4F32(v0_z, v1_z),
108  AKSIMD_MUL_V4F32(v0_w, v1_w)));
109  }
110 
111  // 4-element dot product of 1 common vector with 4 vectors
112  AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
113  {
114  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
115  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
116  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
117  const AKSIMD_V4F32 v0_w = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
118  return DotPoduct4_4x4(v0_x, v0_y, v0_z, v0_w, v1_x, v1_y, v1_z, v1_w);
119  }
120 
121  // Trig functions approximation (based on the Fast versions found in AkMath.h)
123  {
124  const AKSIMD_V4F32 B = AKSIMD_SET_V4F32(4 / PI);
125  const AKSIMD_V4F32 C = AKSIMD_SET_V4F32(-4 / (PI * PI));
126  const AKSIMD_V4F32 P = AKSIMD_SET_V4F32(0.225f);
127 
128  //float y = B * x + C * x * fabs(x); //float y = X*(B+C*fabs(x));
129 
131  y = AKSIMD_MADD_V4F32(y, C, B);
132  y = AKSIMD_MUL_V4F32(y, x);
133 
134  // return P * (y * fabs(y) - y) + y;
135  AKSIMD_V4F32 sine = AKSIMD_ABS_V4F32(y);
136  sine = AKSIMD_MSUB_V4F32(y, sine, y);
137  sine = AKSIMD_MADD_V4F32(sine, P, y);
138  return sine;
139  }
140 
142  {
143  //Compute the offset needed for the cosinus. If you compare with FastCos, the constants have been combined.
144  const AKSIMD_V4F32 offsetNoWrap = AKSIMD_SET_V4F32(PI / 2); // cos = sin(x+pi/2)
145  const AKSIMD_V4F32 offsetWrap = AKSIMD_SET_V4F32(PI / 2 - 2 * PI); // Wrap: cos(x) = cos(x - 2 pi)
146  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
147 
148  // (cond1 >= cond2) ? a : b
149  AKSIMD_V4F32 offset = AKSIMD_SEL_GTEZ_V4F32(AKSIMD_SUB_V4F32(x, vHalfPI), offsetWrap, offsetNoWrap);
150  return AKSIMD_SIN_V4F32(AKSIMD_ADD_V4F32(x, offset));
151  }
152 
154  {
155  const AKSIMD_V4F32 vNeg = AKSIMD_SET_V4F32(-1.0f);
156  const AKSIMD_V4F32 vOne = AKSIMD_SET_V4F32(1.0f);
157  const AKSIMD_V4F32 vZero = AKSIMD_SET_V4F32(0.0f);
158  const AKSIMD_V4F32 vK = AKSIMD_SET_V4F32(0.28f);
159  const AKSIMD_V4F32 vKRepro = AKSIMD_SET_V4F32(1.f / 0.28f);
160  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
161  const AKSIMD_V4F32 vPI = AKSIMD_SET_V4F32(PI);
162  const AKSIMD_V4F32 vEpsilon = AKSIMD_SET_V4F32(1e-20f);
163 
164  //Ensure x is not zero a == 0 ? b : c.
165  x = AKSIMD_VSEL_V4F32(x, vEpsilon, AKSIMD_EQ_V4F32(x, vZero));
166 
167  AKSIMD_V4F32 z = AKSIMD_DIV_V4F32(y, x);
168  AKSIMD_V4F32 absz = AKSIMD_ABS_V4F32(z);
169  AKSIMD_V4COND zcond = AKSIMD_GTEQ_V4F32(vOne, absz);
170 
171  //The approximation is done in 2 segments of the form: offset + z/a*(z*z + b);
172 
173  //if ( fabsf( z ) < 1.0f ) then use .28 for the a coef
174  AKSIMD_V4F32 a = AKSIMD_VSEL_V4F32(vNeg, vK, zcond);
175 
176  //if ( fabsf( z ) < 1.0f ) then use 1 for the b factor, else use 0.28
177  AKSIMD_V4F32 b = AKSIMD_VSEL_V4F32(vK, vKRepro, zcond);
178 
179  AKSIMD_V4F32 atan = AKSIMD_MADD_V4F32(z, z, b);
180  atan = AKSIMD_MUL_V4F32(atan, a);
181  atan = AKSIMD_DIV_V4F32(z, atan);
182 
183  //Adjust for quadrant
184  // zcond x<0 y<0 offset
185  // 1 0 0 0
186  // 1 0 1 0
187  // 1 1 0 +PI
188  // 1 1 1 -PI
189  // 0 0 0 +PI/2
190  // 0 0 1 -PI/2
191  // 0 1 0 +PI/2
192  // 0 1 1 -PI/2
193 
194  AKSIMD_V4F32 offsetByX = AKSIMD_SEL_GTEZ_V4F32(x, vZero, vPI);
195  AKSIMD_V4F32 offset = AKSIMD_VSEL_V4F32(vHalfPI, offsetByX, zcond);
196  AKSIMD_V4F32 sign = AKSIMD_SEL_GTEZ_V4F32(y, vOne, vNeg);
197 
198  //Apply computed offset.
199  atan = AKSIMD_MADD_V4F32(offset, sign, atan);
200  return atan;
201  }
202 
203  //Accepts any positive x. Compare with FastSqrt() which accepts only between ]0,1]
205  {
207  return AKSIMD_GETELEMENT_V4F32(y, 0);
208  }
209 
210  //Compute 1/sqrt(x)
212  {
214  return AKSIMD_GETELEMENT_V4F32(y, 0);
215  }
216 
218  {
220  return AKSIMD_GETELEMENT_V4F32(y, 0);
221  }
222 }
223 
224 #endif //_AKSIMDMATH_H_
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:546
AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:90
AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:112
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimd.h:72
AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v0_w, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:99
#define AKSIMD_VSEL_V4F32(__a__, __b__, __c__)
Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usu...
Definition: AkSimd.h:947
#define AKSIMD_GTEQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:926
#define AKSIMD_RSQRT_V4F32(__a__)
Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
Definition: AkSimd.h:738
AkForceInline AKSIMD_V4F32 AKSIMD_COS_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:141
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimd.h:74
AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32 &xxxx, const AKSIMD_V4F32 &yyyy, const AKSIMD_V4F32 &zzzz, AKSIMD_V4F32 &out_v0, AKSIMD_V4F32 &out_v1, AKSIMD_V4F32 &out_v2, AKSIMD_V4F32 &out_v3)
Definition: AkSimdMath.h:43
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:694
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:563
AkForceInline AKSIMD_V4F32 DotPoduct3_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:83
float AkReal32
32-bit floating point
#define AKSIMD_EQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:941
AkForceInline AkReal32 FastRSqrt(AkReal32 x)
Definition: AkSimdMath.h:211
#define AKSIMD_GETELEMENT_V4F32(__vName, __num__)
Get the element at index num in vector __vName.
Definition: AkSimd.h:38
#define AKSIMD_SUB_V4F32(__a__, __b__)
Definition: AkSimd.h:612
AkForceInline AKSIMD_V4F32 AKSIMD_ATAN2_V4F32(AKSIMD_V4F32 y, AKSIMD_V4F32 x)
Definition: AkSimdMath.h:153
#define AKSIMD_MSUB_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:702
#define AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:953
#define AKSIMD_SET_V4F32(__scalar__)
Definition: AkSimd.h:108
AkForceInline void PermuteVectors4(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz, AKSIMD_V4F32 &out_wwww)
Definition: AkSimdMath.h:68
AkForceInline void PermuteVectors3(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz)
Definition: AkSimdMath.h:29
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:664
AkForceInline AkReal32 FastSqrtLarge(AkReal32 x)
Definition: AkSimdMath.h:204
#define AKSIMD_RECIP_V4F32(__a__)
Reciprocal of x (1/x)
Definition: AkSimd.h:744
#define AKSIMD_ABS_V4F32(__a__)
Returns absolute value.
Definition: AkSimd.h:728
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:626
AkForceInline AKSIMD_V4F32 AKSIMD_SIN_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:122
AkForceInline AkReal32 FastRcp(AkReal32 x)
Definition: AkSimdMath.h:217
#define AkForceInline
Definition: AkTypes.h:63
#define AKSIMD_SQRT_V4F32(__vec__)
Square root (4 floats)
Definition: AkSimd.h:735
#define AKSIMD_MUL_V4F32(__a__, __b__)
Definition: AkSimd.h:657

Was this page helpful?

Need Support?

Questions? Problems? Need more info? Contact us, and we can help!

Visit our Support page

Tell us about your project. We're here to help.

Register your project and we'll help you get started with no strings attached!

Get started with Wwise