Version
menu_open
Wwise SDK 2023.1.4
AkSimdMath.h
Go to the documentation of this file.
1 /***********************************************************************
2  The content of this file includes source code for the sound engine
3  portion of the AUDIOKINETIC Wwise Technology and constitutes "Level
4  Two Source Code" as defined in the Source Code Addendum attached
5  with this file. Any use of the Level Two Source Code shall be
6  subject to the terms and conditions outlined in the Source Code
7  Addendum and the End User License Agreement for Wwise(R).
8 
9  Copyright (c) 2024 Audiokinetic Inc.
10  ***********************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////
13 //
14 // AkSimdMath.h
15 //
16 // Library of static functions for math computations with SIMD in mind.
17 //
18 //////////////////////////////////////////////////////////////////////
19 #ifndef _AKSIMDMATH_H_
20 #define _AKSIMDMATH_H_
21 
24 #include <AkMath.h>
25 
26 namespace AkMath
27 {
28  //Take 4 vectors <x,y,z> and return <x,x,x,x>, <y,y,y,y> and <z,z,z,z>
29  AkForceInline void PermuteVectors3(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
30  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz)
31  {
32  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
33  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
34  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
35  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
36 
37  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
38  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
39  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
40  }
41 
42  //Take 3 vectors <x3,x2,x1,x0>, <y,y,y,y> and <z,z,z,z> and return 4 vectors <x,y,z,z>
43  AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32& xxxx, const AKSIMD_V4F32& yyyy, const AKSIMD_V4F32& zzzz,
44  AKSIMD_V4F32& out_v0, AKSIMD_V4F32& out_v1, AKSIMD_V4F32& out_v2, AKSIMD_V4F32& out_v3)
45  {
46  /*__m128 _mm_shuffle_ps(__m128 lo, __m128 hi, _MM_SHUFFLE(hi3, hi2, lo1, lo0))
47  Interleave inputs into low 2 floats and high 2 floats of output.Basically
48  out[0] = lo[lo0];
49  out[1] = lo[lo1];
50  out[2] = hi[hi2];
51  out[3] = hi[hi3];
52  */
53 
54  AKSIMD_V4F32 x0x1y0y1 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(1, 0, 1, 0));
55  AKSIMD_V4F32 z0z1z0z1 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(1, 0, 1, 0));
56 
57  out_v0 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(2, 0, 2, 0));
58  out_v1 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(3, 1, 3, 1));
59 
60  AKSIMD_V4F32 x2x3y2y3 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(3, 2, 3, 2));
61  AKSIMD_V4F32 z2z3z2z3 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(3, 2, 3, 2));
62 
63  out_v2 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(2, 0, 2, 0));
64  out_v3 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(3, 1, 3, 1));
65  }
66 
67  //Take 4 vectors <x,y,z,w> and return <x,x,x,x>, <y,y,y,y>, <z,z,z,z> and <w,w,w,w>
68  AkForceInline void PermuteVectors4(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
69  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz, AKSIMD_V4F32& out_wwww)
70  {
71  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
72  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
73  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
74  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
75 
76  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
77  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
78  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
79  out_wwww = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(3, 1, 3, 1));
80  }
81 
82  // 3-element dot product of 4 vectors.
84  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
85  {
86  return AKSIMD_ADD_V4F32(AKSIMD_ADD_V4F32(AKSIMD_MUL_V4F32(v0_x, v1_x), AKSIMD_MUL_V4F32(v0_y, v1_y)), AKSIMD_MUL_V4F32(v0_z, v1_z));
87  }
88 
89  // 3-element dot product of 1 common vector with 4 vectors
90  AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
91  {
92  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
93  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
94  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
95  return DotPoduct3_4x4(v0_x, v0_y, v0_z, v1_x, v1_y, v1_z);
96  }
97 
98  // 4-element dot product of 4 vectors.
99  AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32& v0_x, const AKSIMD_V4F32& v0_y, const AKSIMD_V4F32& v0_z, const AKSIMD_V4F32& v0_w,
100  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
101  {
102  return AKSIMD_ADD_V4F32(
104  AKSIMD_MUL_V4F32(v0_x, v1_x),
105  AKSIMD_MUL_V4F32(v0_y, v1_y)),
107  AKSIMD_MUL_V4F32(v0_z, v1_z),
108  AKSIMD_MUL_V4F32(v0_w, v1_w)));
109  }
110 
111  // 4-element dot product of 1 common vector with 4 vectors
112  AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
113  {
114  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
115  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
116  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
117  const AKSIMD_V4F32 v0_w = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
118  return DotPoduct4_4x4(v0_x, v0_y, v0_z, v0_w, v1_x, v1_y, v1_z, v1_w);
119  }
120 
121  // 3-element cross product of 4 vectors, returned as XXXX, YYYY, ZZZZ
123  const AKSIMD_V4F32& u_x, const AKSIMD_V4F32& u_y, const AKSIMD_V4F32& u_z,
124  const AKSIMD_V4F32& v_x, const AKSIMD_V4F32& v_y, const AKSIMD_V4F32& v_z,
125  AKSIMD_V4F32& uXv_x, AKSIMD_V4F32& uXv_y, AKSIMD_V4F32& uXv_z
126  )
127  {
128  uXv_x = AKSIMD_SUB_V4F32(AKSIMD_MUL_V4F32(u_y, v_z), AKSIMD_MUL_V4F32(u_z, v_y));
129  uXv_y = AKSIMD_SUB_V4F32(AKSIMD_MUL_V4F32(u_z, v_x), AKSIMD_MUL_V4F32(u_x, v_z));
130  uXv_z = AKSIMD_SUB_V4F32(AKSIMD_MUL_V4F32(u_x, v_y), AKSIMD_MUL_V4F32(u_y, v_x));
131  }
132 
133  // Trig functions approximation (based on the Fast versions found in AkMath.h)
135  {
136  const AKSIMD_V4F32 B = AKSIMD_SET_V4F32(4 / PI);
137  const AKSIMD_V4F32 C = AKSIMD_SET_V4F32(-4 / (PI * PI));
138  const AKSIMD_V4F32 P = AKSIMD_SET_V4F32(0.225f);
139 
140  //float y = B * x + C * x * fabs(x); //float y = X*(B+C*fabs(x));
141 
143  y = AKSIMD_MADD_V4F32(y, C, B);
144  y = AKSIMD_MUL_V4F32(y, x);
145 
146  // return P * (y * fabs(y) - y) + y;
147  AKSIMD_V4F32 sine = AKSIMD_ABS_V4F32(y);
148  sine = AKSIMD_MSUB_V4F32(y, sine, y);
149  sine = AKSIMD_MADD_V4F32(sine, P, y);
150  return sine;
151  }
152 
154  {
155  //Compute the offset needed for the cosinus. If you compare with FastCos, the constants have been combined.
156  const AKSIMD_V4F32 offsetNoWrap = AKSIMD_SET_V4F32(PI / 2); // cos = sin(x+pi/2)
157  const AKSIMD_V4F32 offsetWrap = AKSIMD_SET_V4F32(PI / 2 - 2 * PI); // Wrap: cos(x) = cos(x - 2 pi)
158  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
159 
160  // (cond1 >= cond2) ? a : b
161  AKSIMD_V4F32 offset = AKSIMD_SEL_GTEZ_V4F32(AKSIMD_SUB_V4F32(x, vHalfPI), offsetWrap, offsetNoWrap);
162  return AKSIMD_SIN_V4F32(AKSIMD_ADD_V4F32(x, offset));
163  }
164 
166  {
167  const AKSIMD_V4F32 vNeg = AKSIMD_SET_V4F32(-1.0f);
168  const AKSIMD_V4F32 vOne = AKSIMD_SET_V4F32(1.0f);
169  const AKSIMD_V4F32 vZero = AKSIMD_SET_V4F32(0.0f);
170  const AKSIMD_V4F32 vK = AKSIMD_SET_V4F32(0.28f);
171  const AKSIMD_V4F32 vKRepro = AKSIMD_SET_V4F32(1.f / 0.28f);
172  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
173  const AKSIMD_V4F32 vPI = AKSIMD_SET_V4F32(PI);
174  const AKSIMD_V4F32 vEpsilon = AKSIMD_SET_V4F32(1e-20f);
175 
176  //Ensure x is not zero a == 0 ? b : c.
177  x = AKSIMD_VSEL_V4F32(x, vEpsilon, AKSIMD_EQ_V4F32(x, vZero));
178 
179  AKSIMD_V4F32 z = AKSIMD_DIV_V4F32(y, x);
180  AKSIMD_V4F32 absz = AKSIMD_ABS_V4F32(z);
181  AKSIMD_V4COND zcond = AKSIMD_GTEQ_V4F32(vOne, absz);
182 
183  //The approximation is done in 2 segments of the form: offset + z/a*(z*z + b);
184 
185  //if ( fabsf( z ) < 1.0f ) then use .28 for the a coef
186  AKSIMD_V4F32 a = AKSIMD_VSEL_V4F32(vNeg, vK, zcond);
187 
188  //if ( fabsf( z ) < 1.0f ) then use 1 for the b factor, else use 0.28
189  AKSIMD_V4F32 b = AKSIMD_VSEL_V4F32(vK, vKRepro, zcond);
190 
191  AKSIMD_V4F32 atan = AKSIMD_MADD_V4F32(z, z, b);
192  atan = AKSIMD_MUL_V4F32(atan, a);
193  atan = AKSIMD_DIV_V4F32(z, atan);
194 
195  //Adjust for quadrant
196  // zcond x<0 y<0 offset
197  // 1 0 0 0
198  // 1 0 1 0
199  // 1 1 0 +PI
200  // 1 1 1 -PI
201  // 0 0 0 +PI/2
202  // 0 0 1 -PI/2
203  // 0 1 0 +PI/2
204  // 0 1 1 -PI/2
205 
206  AKSIMD_V4F32 offsetByX = AKSIMD_SEL_GTEZ_V4F32(x, vZero, vPI);
207  AKSIMD_V4F32 offset = AKSIMD_VSEL_V4F32(vHalfPI, offsetByX, zcond);
208  AKSIMD_V4F32 sign = AKSIMD_SEL_GTEZ_V4F32(y, vOne, vNeg);
209 
210  //Apply computed offset.
211  atan = AKSIMD_MADD_V4F32(offset, sign, atan);
212  return atan;
213  }
214 
215  //Accepts any positive x. Compare with FastSqrt() which accepts only between ]0,1]
217  {
219  return AKSIMD_GETELEMENT_V4F32(y, 0);
220  }
221 
222  //Compute 1/sqrt(x)
224  {
226  return AKSIMD_GETELEMENT_V4F32(y, 0);
227  }
228 
230  {
232  return AKSIMD_GETELEMENT_V4F32(y, 0);
233  }
234 }
235 
236 #endif //_AKSIMDMATH_H_
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:508
AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:90
AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:112
AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v0_w, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:99
#define AKSIMD_VSEL_V4F32(__a__, __b__, __c__)
Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usu...
Definition: AkSimd.h:909
#define AKSIMD_GTEQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:888
#define AKSIMD_RSQRT_V4F32(__a__)
Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
Definition: AkSimd.h:700
AkForceInline AKSIMD_V4F32 AKSIMD_COS_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:153
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimdTypes.h:62
AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32 &xxxx, const AKSIMD_V4F32 &yyyy, const AKSIMD_V4F32 &zzzz, AKSIMD_V4F32 &out_v0, AKSIMD_V4F32 &out_v1, AKSIMD_V4F32 &out_v2, AKSIMD_V4F32 &out_v3)
Definition: AkSimdMath.h:43
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:656
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:525
AkForceInline AKSIMD_V4F32 DotPoduct3_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:83
float AkReal32
32-bit floating point
AkForceInline void CrossProduct3_4x4(const AKSIMD_V4F32 &u_x, const AKSIMD_V4F32 &u_y, const AKSIMD_V4F32 &u_z, const AKSIMD_V4F32 &v_x, const AKSIMD_V4F32 &v_y, const AKSIMD_V4F32 &v_z, AKSIMD_V4F32 &uXv_x, AKSIMD_V4F32 &uXv_y, AKSIMD_V4F32 &uXv_z)
Definition: AkSimdMath.h:122
#define AKSIMD_EQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:903
AkForceInline AkReal32 FastRSqrt(AkReal32 x)
Definition: AkSimdMath.h:223
#define AKSIMD_GETELEMENT_V4F32(__vName, __num__)
Get the element at index num in vector __vName.
Definition: AkSimd.h:37
#define AKSIMD_SUB_V4F32(__a__, __b__)
Definition: AkSimd.h:574
AkForceInline AKSIMD_V4F32 AKSIMD_ATAN2_V4F32(AKSIMD_V4F32 y, AKSIMD_V4F32 x)
Definition: AkSimdMath.h:165
#define AKSIMD_MSUB_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:664
#define AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:915
#define AKSIMD_SET_V4F32(__scalar__)
Definition: AkSimd.h:70
AkForceInline void PermuteVectors4(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz, AKSIMD_V4F32 &out_wwww)
Definition: AkSimdMath.h:68
AkForceInline void PermuteVectors3(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz)
Definition: AkSimdMath.h:29
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:626
AkForceInline AkReal32 FastSqrtLarge(AkReal32 x)
Definition: AkSimdMath.h:216
#define AKSIMD_RECIP_V4F32(__a__)
Reciprocal of x (1/x)
Definition: AkSimd.h:706
#define AKSIMD_ABS_V4F32(__a__)
Returns absolute value.
Definition: AkSimd.h:690
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:588
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimdTypes.h:64
AkForceInline AKSIMD_V4F32 AKSIMD_SIN_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:134
AkForceInline AkReal32 FastRcp(AkReal32 x)
Definition: AkSimdMath.h:229
#define AkForceInline
Definition: AkTypes.h:63
#define AKSIMD_SQRT_V4F32(__vec__)
Square root (4 floats)
Definition: AkSimd.h:697
#define AKSIMD_MUL_V4F32(__a__, __b__)
Definition: AkSimd.h:619

Was this page helpful?

Need Support?

Questions? Problems? Need more info? Contact us, and we can help!

Visit our Support page

Tell us about your project. We're here to help.

Register your project and we'll help you get started with no strings attached!

Get started with Wwise