Wwise Version
Wwise SDK 2021.1.6
AkSimdMath.h
Go to the documentation of this file.
1 /***********************************************************************
2  The content of this file includes source code for the sound engine
3  portion of the AUDIOKINETIC Wwise Technology and constitutes "Level
4  Two Source Code" as defined in the Source Code Addendum attached
5  with this file. Any use of the Level Two Source Code shall be
6  subject to the terms and conditions outlined in the Source Code
7  Addendum and the End User License Agreement for Wwise(R).
8 
9  Version: v2021.1.6 Build: 7774
10  Copyright (c) 2006-2022 Audiokinetic Inc.
11  ***********************************************************************/
12 
13 //////////////////////////////////////////////////////////////////////
14 //
15 // AkSimdMath.h
16 //
17 // Library of static functions for math computations with SIMD in mind.
18 //
19 //////////////////////////////////////////////////////////////////////
20 #ifndef _AKSIMDMATH_H_
21 #define _AKSIMDMATH_H_
22 
25 #include <AkMath.h>
26 
27 namespace AkMath
28 {
29  //Take 4 vectors <x,y,z> and return <x,x,x,x>, <y,y,y,y> and <z,z,z,z>
30  AkForceInline void PermuteVectors3(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
31  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz)
32  {
33  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
34  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
35  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
36  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
37 
38  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
39  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
40  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
41  }
42 
43  //Take 3 vectors <x3,x2,x1,x0>, <y,y,y,y> and <z,z,z,z> and return 4 vectors <x,y,z,z>
44  AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32& xxxx, const AKSIMD_V4F32& yyyy, const AKSIMD_V4F32& zzzz,
45  AKSIMD_V4F32& out_v0, AKSIMD_V4F32& out_v1, AKSIMD_V4F32& out_v2, AKSIMD_V4F32& out_v3)
46  {
47  /*__m128 _mm_shuffle_ps(__m128 lo, __m128 hi, _MM_SHUFFLE(hi3, hi2, lo1, lo0))
48  Interleave inputs into low 2 floats and high 2 floats of output.Basically
49  out[0] = lo[lo0];
50  out[1] = lo[lo1];
51  out[2] = hi[hi2];
52  out[3] = hi[hi3];
53  */
54 
55  AKSIMD_V4F32 x0x1y0y1 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(1, 0, 1, 0));
56  AKSIMD_V4F32 z0z1z0z1 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(1, 0, 1, 0));
57 
58  out_v0 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(2, 0, 2, 0));
59  out_v1 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(3, 1, 3, 1));
60 
61  AKSIMD_V4F32 x2x3y2y3 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(3, 2, 3, 2));
62  AKSIMD_V4F32 z2z3z2z3 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(3, 2, 3, 2));
63 
64  out_v2 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(2, 0, 2, 0));
65  out_v3 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(3, 1, 3, 1));
66  }
67 
68  //Take 4 vectors <x,y,z,w> and return <x,x,x,x>, <y,y,y,y>, <z,z,z,z> and <w,w,w,w>
69  AkForceInline void PermuteVectors4(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
70  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz, AKSIMD_V4F32& out_wwww)
71  {
72  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
73  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
74  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
75  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
76 
77  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
78  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
79  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
80  out_wwww = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(3, 1, 3, 1));
81  }
82 
83  // 3-element dot product of 4 vectors.
85  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
86  {
87  return AKSIMD_ADD_V4F32(AKSIMD_ADD_V4F32(AKSIMD_MUL_V4F32(v0_x, v1_x), AKSIMD_MUL_V4F32(v0_y, v1_y)), AKSIMD_MUL_V4F32(v0_z, v1_z));
88  }
89 
90  // 3-element dot product of 1 common vector with 4 vectors
91  AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
92  {
93  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
94  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
95  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
96  return DotPoduct3_4x4(v0_x, v0_y, v0_z, v1_x, v1_y, v1_z);
97  }
98 
99  // 4-element dot product of 4 vectors.
100  AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32& v0_x, const AKSIMD_V4F32& v0_y, const AKSIMD_V4F32& v0_z, const AKSIMD_V4F32& v0_w,
101  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
102  {
103  return AKSIMD_ADD_V4F32(
105  AKSIMD_MUL_V4F32(v0_x, v1_x),
106  AKSIMD_MUL_V4F32(v0_y, v1_y)),
108  AKSIMD_MUL_V4F32(v0_z, v1_z),
109  AKSIMD_MUL_V4F32(v0_w, v1_w)));
110  }
111 
112  // 4-element dot product of 1 common vector with 4 vectors
113  AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
114  {
115  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
116  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
117  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
118  const AKSIMD_V4F32 v0_w = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
119  return DotPoduct4_4x4(v0_x, v0_y, v0_z, v0_w, v1_x, v1_y, v1_z, v1_w);
120  }
121 
122  // Trig functions approximation (based on the Fast versions found in AkMath.h)
124  {
125  const AKSIMD_V4F32 B = AKSIMD_SET_V4F32(4 / PI);
126  const AKSIMD_V4F32 C = AKSIMD_SET_V4F32(-4 / (PI * PI));
127  const AKSIMD_V4F32 P = AKSIMD_SET_V4F32(0.225f);
128 
129  //float y = B * x + C * x * fabs(x); //float y = X*(B+C*fabs(x));
130 
132  y = AKSIMD_MADD_V4F32(y, C, B);
133  y = AKSIMD_MUL_V4F32(y, x);
134 
135  // return P * (y * fabs(y) - y) + y;
136  AKSIMD_V4F32 sine = AKSIMD_ABS_V4F32(y);
137  sine = AKSIMD_MSUB_V4F32(y, sine, y);
138  sine = AKSIMD_MADD_V4F32(sine, P, y);
139  return sine;
140  }
141 
143  {
144  //Compute the offset needed for the cosinus. If you compare with FastCos, the constants have been combined.
145  const AKSIMD_V4F32 offsetNoWrap = AKSIMD_SET_V4F32(PI / 2); // cos = sin(x+pi/2)
146  const AKSIMD_V4F32 offsetWrap = AKSIMD_SET_V4F32(PI / 2 - 2 * PI); // Wrap: cos(x) = cos(x - 2 pi)
147  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
148 
149  // (cond1 >= cond2) ? a : b
150  AKSIMD_V4F32 offset = AKSIMD_SEL_GTEZ_V4F32(AKSIMD_SUB_V4F32(x, vHalfPI), offsetWrap, offsetNoWrap);
151  return AKSIMD_SIN_V4F32(AKSIMD_ADD_V4F32(x, offset));
152  }
153 
155  {
156  const AKSIMD_V4F32 vNeg = AKSIMD_SET_V4F32(-1.0f);
157  const AKSIMD_V4F32 vOne = AKSIMD_SET_V4F32(1.0f);
158  const AKSIMD_V4F32 vZero = AKSIMD_SET_V4F32(0.0f);
159  const AKSIMD_V4F32 vK = AKSIMD_SET_V4F32(0.28f);
160  const AKSIMD_V4F32 vKRepro = AKSIMD_SET_V4F32(1.f / 0.28f);
161  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
162  const AKSIMD_V4F32 vPI = AKSIMD_SET_V4F32(PI);
163  const AKSIMD_V4F32 vEpsilon = AKSIMD_SET_V4F32(1e-20f);
164 
165  //Ensure x is not zero a == 0 ? b : c.
166  x = AKSIMD_VSEL_V4F32(x, vEpsilon, AKSIMD_EQ_V4F32(x, vZero));
167 
168  AKSIMD_V4F32 z = AKSIMD_DIV_V4F32(y, x);
169  AKSIMD_V4F32 absz = AKSIMD_ABS_V4F32(z);
170  AKSIMD_V4COND zcond = AKSIMD_GTEQ_V4F32(vOne, absz);
171 
172  //The approximation is done in 2 segments of the form: offset + z/a*(z*z + b);
173 
174  //if ( fabsf( z ) < 1.0f ) then use .28 for the a coef
175  AKSIMD_V4F32 a = AKSIMD_VSEL_V4F32(vNeg, vK, zcond);
176 
177  //if ( fabsf( z ) < 1.0f ) then use 1 for the b factor, else use 0.28
178  AKSIMD_V4F32 b = AKSIMD_VSEL_V4F32(vK, vKRepro, zcond);
179 
180  AKSIMD_V4F32 atan = AKSIMD_MADD_V4F32(z, z, b);
181  atan = AKSIMD_MUL_V4F32(atan, a);
182  atan = AKSIMD_DIV_V4F32(z, atan);
183 
184  //Adjust for quadrant
185  // zcond x<0 y<0 offset
186  // 1 0 0 0
187  // 1 0 1 0
188  // 1 1 0 +PI
189  // 1 1 1 -PI
190  // 0 0 0 +PI/2
191  // 0 0 1 -PI/2
192  // 0 1 0 +PI/2
193  // 0 1 1 -PI/2
194 
195  AKSIMD_V4F32 offsetByX = AKSIMD_SEL_GTEZ_V4F32(x, vZero, vPI);
196  AKSIMD_V4F32 offset = AKSIMD_VSEL_V4F32(vHalfPI, offsetByX, zcond);
197  AKSIMD_V4F32 sign = AKSIMD_SEL_GTEZ_V4F32(y, vOne, vNeg);
198 
199  //Apply computed offset.
200  atan = AKSIMD_MADD_V4F32(offset, sign, atan);
201  return atan;
202  }
203 
204  //Accepts any positive x. Compare with FastSqrt() which accepts only between ]0,1]
206  {
208  return AKSIMD_GETELEMENT_V4F32(y, 0);
209  }
210 
211  //Compute 1/sqrt(x)
213  {
215  return AKSIMD_GETELEMENT_V4F32(y, 0);
216  }
217 
219  {
221  return AKSIMD_GETELEMENT_V4F32(y, 0);
222  }
223 }
224 
225 #endif //_AKSIMDMATH_H_
float AkReal32
32-bit floating point
Definition: AkTypes.h:70
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:536
AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:91
AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:113
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats.
Definition: AkSimd.h:73
AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v0_w, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:100
#define AKSIMD_VSEL_V4F32(__a__, __b__, __c__)
Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usu...
Definition: AkSimd.h:933
#define AKSIMD_GTEQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:912
Definition: AkSimdMath.h:28
#define AKSIMD_RSQRT_V4F32(__a__)
Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
Definition: AkSimd.h:724
AkForceInline AKSIMD_V4F32 AKSIMD_COS_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:142
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results.
Definition: AkSimd.h:75
AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32 &xxxx, const AKSIMD_V4F32 &yyyy, const AKSIMD_V4F32 &zzzz, AKSIMD_V4F32 &out_v0, AKSIMD_V4F32 &out_v1, AKSIMD_V4F32 &out_v2, AKSIMD_V4F32 &out_v3)
Definition: AkSimdMath.h:44
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:680
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:549
AkForceInline AKSIMD_V4F32 DotPoduct3_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:84
#define AKSIMD_EQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:927
AkForceInline AkReal32 FastRSqrt(AkReal32 x)
Definition: AkSimdMath.h:212
#define AKSIMD_SUB_V4F32(__a__, __b__)
Definition: AkSimd.h:598
AkForceInline AKSIMD_V4F32 AKSIMD_ATAN2_V4F32(AKSIMD_V4F32 y, AKSIMD_V4F32 x)
Definition: AkSimdMath.h:154
#define AKSIMD_MSUB_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:688
#define AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:939
#define AKSIMD_SET_V4F32(__scalar__)
Definition: AkSimd.h:109
AkForceInline void PermuteVectors4(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz, AKSIMD_V4F32 &out_wwww)
Definition: AkSimdMath.h:69
AkForceInline void PermuteVectors3(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz)
Definition: AkSimdMath.h:30
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division.
Definition: AkSimd.h:650
AkForceInline AkReal32 FastSqrtLarge(AkReal32 x)
Definition: AkSimdMath.h:205
#define AKSIMD_RECIP_V4F32(__a__)
Reciprocal of x (1/x)
Definition: AkSimd.h:730
#define AKSIMD_ABS_V4F32(__a__)
Returns absolute value.
Definition: AkSimd.h:714
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:612
#define AKSIMD_GETELEMENT_V4F32(__vName, __num__)
Retrieve scalar element from vector.
Definition: AkSimd.h:38
AkForceInline AKSIMD_V4F32 AKSIMD_SIN_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:123
AkForceInline AkReal32 FastRcp(AkReal32 x)
Definition: AkSimdMath.h:218
#define AkForceInline
Definition: AkTypes.h:60
#define AKSIMD_SQRT_V4F32(__vec__)
Square root (4 floats)
Definition: AkSimd.h:721
#define AKSIMD_MUL_V4F32(__a__, __b__)
Definition: AkSimd.h:643