On 13.04.2013 01:55, Nozomi Kodama wrote:
> Is there a problem with this patch?
> http://source.winehq.org/patches/data/95555
> Nozomi
Looks pretty much OK, but isn't there a way to reduce the size a bit?
See the attached dirty hack for one idea. D3DXSHMultiply6 will add a
lot of lines too...
Also, is there a reason why we use constants with different precision
(e.g. 0.28209479f in D3DXSHMultiply4 versus 0.2820948064f elsewhere)?
Cheers
Rico
commit 5939b27e87acbd1cac30faa975e4c3d246ff5f9d
Author: Rico Schüller <kgbric...@web.de>
Date: Sun Apr 14 16:49:16 2013 +0200
d3dx9
diff --git a/dlls/d3dx9_36/math.c b/dlls/d3dx9_36/math.c
index 207d152..d675ad0 100644
--- a/dlls/d3dx9_36/math.c
+++ b/dlls/d3dx9_36/math.c
@@ -2580,6 +2580,63 @@ FLOAT * WINAPI D3DXSHMultiply2(FLOAT *out, const FLOAT *a, const FLOAT *b)
return out;
}
+#define c1(i1, m1) /* One-term coefficient: ta = m1 * a[i1], tb = m1 * b[i1]. */\
+ ta = m1 * a[i1];\
+ tb = m1 * b[i1]; /* expands to bare statements; a, b, ta, tb must be in scope at the call site */
+
+#define c2(i1, m1, i2, m2) /* Two-term coefficient: ta/tb = weighted sum of entries i1 and i2 of a/b. */\
+ ta = m1 * a[i1] + m2 * a[i2];\
+ tb = m1 * b[i1] + m2 * b[i2]; /* expands to bare statements; a, b, ta, tb must be in scope at the call site */
+
+#define c3(i1, m1, i2, m2, i3, m3) /* Three-term coefficient: ta/tb = weighted sum of entries i1, i2, i3 of a/b. */\
+ ta = m1 * a[i1] + m2 * a[i2] + m3 * a[i3];\
+ tb = m1 * b[i1] + m2 * b[i2] + m3 * b[i3]; /* expands to bare statements; a, b, ta, tb must be in scope at the call site */
+
+#define t1(i1, v1) /* Same-index term: writes out[i1] from v1 and the current ta/tb, then sets t. */\
+ out[i1] = v1 + ta * b[i1] + tb * a[i1]; /* v1 is 0.0f to assign or out[i1] to accumulate */\
+ t = a[i1] * b[i1]; /* t is consumed by the statements that follow the expansion */
+
+#define t2(i1, v1, i2, v2) /* Mixed-index term: writes out[i1] and out[i2] from the current ta/tb, then sets t. */\
+ out[i1] = v1 + ta * b[i2] + tb * a[i2]; /* v1 is 0.0f to assign or out[i1] to accumulate */\
+ out[i2] = v2 + ta * b[i1] + tb * a[i1]; /* v2 is 0.0f to assign or out[i2] to accumulate */\
+ t = a[i1] * b[i2] + a[i2] * b[i1]; /* symmetric product, consumed by the statements that follow */
+
+#define c1_t1(i1, m1, v1, i2, v2) /* c1 then t1 on index i2, then feed t back into out[i1]. */\
+ c1(i1, m1) /* ta/tb from entry i1 */\
+ t1(i2, v2) /* out[i2] = v2 + ..., t = a[i2] * b[i2] */\
+ out[i1] = v1 + m1 * t; /* v1 is 0.0f to assign or out[i1] to accumulate */
+
+#define c1_t2(i1, m1, v1, i2, v2, i3, v3) /* c1 then t2 on the pair (i2, i3), then feed t back into out[i1]. */\
+ c1(i1, m1) /* ta/tb from entry i1 */\
+ t2(i2, v2, i3, v3) /* updates out[i2] and out[i3], t = a[i2] * b[i3] + a[i3] * b[i2] */\
+ out[i1] = v1 + m1 * t; /* v1 is 0.0f to assign or out[i1] to accumulate */
+
+#define c2_t1(i1, m1, v1, i2, m2, v2, i3, v3) /* c2 then t1 on index i3, then feed t back into out[i1] and out[i2]. */\
+ c2(i1, m1, i2, m2) /* ta/tb from entries i1 and i2 */\
+ t1(i3, v3) /* out[i3] = v3 + ..., t = a[i3] * b[i3] */\
+ out[i1] = v1 + m1 * t; /* v1 is 0.0f to assign or out[i1] to accumulate */\
+ out[i2] = v2 + m2 * t; /* likewise for v2 and out[i2] */
+
+#define c2_t2(i1, m1, v1, i2, m2, v2, i3, v3, i4, v4) /* c2 then t2 on the pair (i3, i4), then feed t back into out[i1] and out[i2]. */\
+ c2(i1, m1, i2, m2) /* ta/tb from entries i1 and i2 */\
+ t2(i3, v3, i4, v4) /* updates out[i3] and out[i4], t = a[i3] * b[i4] + a[i4] * b[i3] */\
+ out[i1] = v1 + m1 * t; /* v1 is 0.0f to assign or out[i1] to accumulate */\
+ out[i2] = v2 + m2 * t; /* likewise for v2 and out[i2] */
+
+#define c3_t1(i1, m1, v1, i2, m2, v2, i3, m3, v3, i4, v4) /* c3 then t1 on index i4, then feed t back into out[i1], out[i2], out[i3]. */\
+ c3(i1, m1, i2, m2, i3, m3) /* ta/tb from entries i1, i2 and i3 */\
+ t1(i4, v4) /* out[i4] = v4 + ..., t = a[i4] * b[i4] */\
+ out[i1] = v1 + m1 * t; /* v1 is 0.0f to assign or out[i1] to accumulate */\
+ out[i2] = v2 + m2 * t; /* likewise for v2 and out[i2] */\
+ out[i3] = v3 + m3 * t; /* likewise for v3 and out[i3] */
+
+#define c3_t2(i1, m1, v1, i2, m2, v2, i3, m3, v3, i4, v4, i5, v5) /* c3 then t2 on the pair (i4, i5), then feed t back into out[i1], out[i2], out[i3]. */\
+ c3(i1, m1, i2, m2, i3, m3) /* ta/tb from entries i1, i2 and i3 */\
+ t2(i4, v4, i5, v5) /* updates out[i4] and out[i5], t = a[i4] * b[i5] + a[i5] * b[i4] */\
+ out[i1] = v1 + m1 * t; /* v1 is 0.0f to assign or out[i1] to accumulate */\
+ out[i2] = v2 + m2 * t; /* likewise for v2 and out[i2] */\
+ out[i3] = v3 + m3 * t; /* likewise for v3 and out[i3] */
+
FLOAT * WINAPI D3DXSHMultiply3(FLOAT *out, const FLOAT *a, const FLOAT *b)
{
FLOAT t, ta, tb;
@@ -2587,94 +2644,19 @@ FLOAT * WINAPI D3DXSHMultiply3(FLOAT *out, const FLOAT *a, const FLOAT *b)
TRACE("out %p, a %p, b %p\n", out, a, b);
out[0] = 0.28209479f * a[0] * b[0];
-
- ta = 0.28209479f * a[0] - 0.12615662f * a[6] - 0.21850968f * a[8];
- tb = 0.28209479f * b[0] - 0.12615662f * b[6] - 0.21850968f * b[8];
- out[1] = ta * b[1] + tb * a[1];
- t = a[1] * b[1];
- out[0] += 0.28209479f * t;
- out[6] = -0.12615662f * t;
- out[8] = -0.21850968f * t;
-
- ta = 0.21850968f * a[5];
- tb = 0.21850968f * b[5];
- out[1] += ta * b[2] + tb * a[2];
- out[2] = ta * b[1] + tb * a[1];
- t = a[1] * b[2] +a[2] * b[1];
- out[5] = 0.21850968f * t;
-
- ta = 0.21850968f * a[4];
- tb = 0.21850968f * b[4];
- out[1] += ta * b[3] + tb * a[3];
- out[3] = ta * b[1] + tb * a[1];
- t = a[1] * b[3] + a[3] * b[1];
- out[4] = 0.21850968f * t;
-
- ta = 0.28209480f * a[0] + 0.25231326f * a[6];
- tb = 0.28209480f * b[0] + 0.25231326f * b[6];
- out[2] += ta * b[2] + tb * a[2];
- t = a[2] * b[2];
- out[0] += 0.28209480f * t;
- out[6] += 0.25231326f * t;
-
- ta = 0.21850969f * a[7];
- tb = 0.21850969f * b[7];
- out[2] += ta * b[3] + tb * a[3];
- out[3] += ta * b[2] + tb * a[2];
- t = a[2] * b[3] + a[3] * b[2];
- out[7] = 0.21850969f * t;
-
- ta = 0.28209479f * a[0] - 0.12615663f * a[6] + 0.21850969f * a[8];
- tb = 0.28209479f * b[0] - 0.12615663f * b[6] + 0.21850969f * b[8];
- out[3] += ta * b[3] + tb * a[3];
- t = a[3] * b[3];
- out[0] += 0.28209479f * t;
- out[6] -= 0.12615663f * t;
- out[8] += 0.21850969f * t;
-
- ta = 0.28209479f * a[0] - 0.18022375f * a[6];
- tb = 0.28209479f * b[0] - 0.18022375f * b[6];
- out[4] += ta * b[4] + tb * a[4];
- t = a[4] * b[4];
- out[0] += 0.28209479f * t;
- out[6] -= 0.18022375f * t;
-
- ta = 0.15607835f * a[7];
- tb = 0.15607835f * b[7];
- out[4] += ta * b[5] + tb * a[5];
- out[5] += ta * b[4] + tb * a[4];
- t = a[4] * b[5] + a[5] * b[4];
- out[7] += 0.15607834f * t;
-
- ta = 0.28209479f * a[0] + 0.09011186 * a[6] - 0.15607835f * a[8];
- tb = 0.28209479f * b[0] + 0.09011186 * b[6] - 0.15607835f * b[8];
- out[5] += ta * b[5] + tb * a[5];
- t = a[5] * b[5];
- out[0] += 0.28209479f * t;
- out[6] += 0.09011186f * t;
- out[8] -= 0.15607835f * t;
-
- ta = 0.28209480f * a[0];
- tb = 0.28209480f * b[0];
- out[6] += ta * b[6] + tb * a[6];
- t = a[6] * b[6];
- out[0] += 0.28209480f * t;
+ c3_t1(0, 0.28209479f, out[0], 6, -0.12615662f, 0.0f, 8, -0.21850968f, 0.0f, 1, 0.0f) /* t = a[1] * b[1]; first write to out[1], out[6], out[8] */
+ c1_t2(5, 0.21850968f, 0.0f, 1, out[1], 2, 0.0f) /* t = a[1] * b[2] + a[2] * b[1]; first write to out[2], out[5] */
+ c1_t2(4, 0.21850968f, 0.0f, 1, out[1], 3, 0.0f) /* t = a[1] * b[3] + a[3] * b[1]; first write to out[3], out[4] */
+ c2_t1(0, 0.28209480f, out[0], 6, 0.25231326f, out[6], 2, out[2]) /* t = a[2] * b[2] */
+ c1_t2(7, 0.21850969f, 0.0f, 2, out[2], 3, out[3]) /* t = a[2] * b[3] + a[3] * b[2]; first write to out[7]; 'f' suffix keeps the math in float as in the removed code */
+ c3_t1(0, 0.28209479f, out[0], 6, -0.12615663f, out[6], 8, 0.21850969f, out[8], 3, out[3]) /* t = a[3] * b[3]; coefficients match the removed code */
+ c2_t1(0, 0.28209479f, out[0], 6, -0.18022375f, out[6], 4, out[4]) /* t = a[4] * b[4] */
+ c1_t2(7, 0.15607835f, out[7], 4, out[4], 5, out[5]) /* t = a[4] * b[5] + a[5] * b[4] */
+ c3_t1(0, 0.28209479f, out[0], 6, 0.09011186f, out[6], 8, -0.15607835f, out[8], 5, out[5]) /* t = a[5] * b[5] */
+ c1_t1(0, 0.28209480f, out[0], 6, out[6]) /* t = a[6] * b[6]; t is reused by the next statement */
out[6] += 0.18022376f * t;
-
- ta = 0.28209479f * a[0] + 0.09011186 * a[6] + 0.15607835f * a[8];
- tb = 0.28209479f * b[0] + 0.09011186 * b[6] + 0.15607835f * b[8];
- out[7] += ta * b[7] + tb * a[7];
- t = a[7] * b[7];
- out[0] += 0.28209479f * t;
- out[6] += 0.09011186f * t;
- out[8] += 0.15607835f * t;
-
- ta = 0.28209479f * a[0] - 0.18022375f * a[6];
- tb = 0.28209479f * b[0] - 0.18022375f * b[6];
- out[8] += ta * b[8] + tb * a[8];
- t = a[8] * b[8];
- out[0] += 0.28209479f * t;
- out[6] -= 0.18022375f * t;
+ c3_t1(0, 0.28209479f, out[0], 6, 0.09011186f, out[6], 8, 0.15607835f, out[8], 7, out[7]) /* t = a[7] * b[7]; accumulates into out[0], out[6], out[7], out[8] */
+ c2_t1(0, 0.28209479f, out[0], 6, -0.18022375f, out[6], 8, out[8]) /* t = a[8] * b[8]; accumulates into out[0], out[6], out[8] */
return out;
}
@@ -2686,283 +2668,43 @@ FLOAT * WINAPI D3DXSHMultiply4(FLOAT *out, const FLOAT *a, const FLOAT *b)
TRACE("out %p, a %p, b %p\n", out, a, b);
out[0] = 0.28209479f * a[0] * b[0];
-
- ta = 0.28209479f * a[0] - 0.12615663f * a[6] - 0.21850969f * a[8];
- tb = 0.28209479f * b[0] - 0.12615663f * b[6] - 0.21850969f * b[8];
- out[1] = ta * b[1] + tb * a[1];
- t = a[1] * b[1];
- out[0] += 0.28209479f * t;
- out[6] = -0.12615663f * t;
- out[8] = -0.21850969f * t;
-
- ta = 0.21850969f * a[3] - 0.05839917f * a[13] - 0.22617901f * a[15];
- tb = 0.21850969f * b[3] - 0.05839917f * b[13] - 0.22617901f * b[15];
- out[1] += ta * b[4] + tb * a[4];
- out[4] = ta * b[1] + tb * a[1];
- t = a[1] * b[4] + a[4] * b[1];
- out[3] = 0.21850969f * t;
- out[13] = -0.05839917f * t;
- out[15] = -0.22617901f * t;
-
- ta = 0.21850969f * a[2] - 0.14304817f * a[12] - 0.18467439f * a[14];
- tb = 0.21850969f * b[2] - 0.14304817f * b[12] - 0.18467439f * b[14];
- out[1] += ta * b[5] + tb * a[5];
- out[5] = ta * b[1] + tb * a[1];
- t = a[1] * b[5] + a[5] * b[1];
- out[2] = 0.21850969f * t;
- out[12] = -0.14304817f * t;
- out[14] = -0.18467439f * t;
-
- ta = 0.20230066f * a[11];
- tb = 0.20230066f * b[11];
- out[1] += ta * b[6] + tb * a[6];
- out[6] += ta * b[1] + tb * a[1];
- t = a[1] * b[6] + a[6] * b[1];
- out[11] = 0.20230066f * t;
-
- ta = 0.22617901f * a[9] + 0.05839917f * a[11];
- tb = 0.22617901f * b[9] + 0.05839917f * b[11];
- out[1] += ta * b[8] + tb * a[8];
- out[8] += ta * b[1] + tb * a[1];
- t = a[1] * b[8] + a[8] * b[1];
- out[9] = 0.22617901f * t;
- out[11] += 0.05839917f * t;
-
- ta = 0.28209480f * a[0] + 0.25231326f * a[6];
- tb = 0.28209480f * b[0] + 0.25231326f * b[6];
- out[2] += ta * b[2] + tb * a[2];
- t = a[2] * b[2];
- out[0] += 0.28209480f * t;
- out[6] += 0.25231326f * t;
-
- ta = 0.24776671f * a[12];
- tb = 0.24776671f * b[12];
- out[2] += ta * b[6] + tb * a[6];
- out[6] += ta * b[2] + tb * a[2];
- t = a[2] * b[6] + a[6] * b[2];
- out[12] += 0.24776671f * t;
-
- ta = 0.28209480f * a[0] - 0.12615663f * a[6] + 0.21850969f * a[8];
- tb = 0.28209480f * b[0] - 0.12615663f * b[6] + 0.21850969f * b[8];
- out[3] += ta * b[3] + tb * a[3];
- t = a[3] * b[3];
- out[0] += 0.28209480f * t;
- out[6] -= 0.12615663f * t;
- out[8] += 0.21850969f * t;
-
- ta = 0.20230066f * a[13];
- tb = 0.20230066f * b[13];
- out[3] += ta * b[6] + tb * a[6];
- out[6] += ta * b[3] + tb * a[3];
- t = a[3] * b[6] + a[6] * b[3];
- out[13] += 0.20230066f * t;
-
- ta = 0.21850969f * a[2] - 0.14304817f * a[12] + 0.18467439f * a[14];
- tb = 0.21850969f * b[2] - 0.14304817f * b[12] + 0.18467439f * b[14];
- out[3] += ta * b[7] + tb * a[7];
- out[7] = ta * b[3] + tb * a[3];
- t = a[3] * b[7] + a[7] * b[3];
- out[2] += 0.21850969f * t;
- out[12] -= 0.14304817f * t;
- out[14] += 0.18467439f * t;
-
- ta = -0.05839917f * a[13] + 0.22617901f * a[15];
- tb = -0.05839917f * b[13] + 0.22617901f * b[15];
- out[3] += ta * b[8] + tb * a[8];
- out[8] += ta * b[3] + tb * a[3];
- t = a[3] * b[8] + a[8] * b[3];
- out[13] -= 0.05839917f * t;
- out[15] += 0.22617901f * t;
-
- ta = 0.28209479f * a[0] - 0.18022375f * a[6];
- tb = 0.28209479f * b[0] - 0.18022375f * b[6];
- out[4] += ta * b[4] + tb * a[4];
- t = a[4] * b[4];
- out[0] += 0.28209479f * t;
- out[6] -= 0.18022375f * t;
-
- ta = 0.15607835f * a[7];
- tb = 0.15607835f * b[7];
- out[4] += ta * b[5] + tb * a[5];
- out[5] += ta * b[4] + tb * a[4];
- t = a[4] * b[5] + a[5] * b[4];
- out[7] += 0.15607835f * t;
-
- ta = 0.22617901f * a[3] - 0.09403160f * a[13];
- tb = 0.22617901f * b[3] - 0.09403160f * b[13];
- out[4] += ta * b[9] + tb * a[9];
- out[9] += ta * b[4] + tb * a[4];
- t = a[4] * b[9] + a[9] * b[4];
- out[3] += 0.22617901f * t;
- out[13] -= 0.09403160f * t;
-
- ta = 0.18467439f * a[2] - 0.18806319f * a[12];
- tb = 0.18467439f * b[2] - 0.18806319f * b[12];
- out[4] += ta * b[10] + tb * a [10];
- out[10] = ta * b[4] + tb * a[4];
- t = a[4] * b[10] + a[10] * b[4];
- out[2] += 0.18467439f * t;
- out[12] -= 0.18806319f * t;
-
- ta = -0.05839917f * a[3] + 0.14567312f * a[13] + 0.09403160f * a[15];
- tb = -0.05839917f * b[3] + 0.14567312f * b[13] + 0.09403160f * b[15];
- out[4] += ta * b[11] + tb * a[11];
- out[11] += ta * b[4] + tb * a[4];
- t = a[4] * b[11] + a[11] * b[4];
- out[3] -= 0.05839917f * t;
- out[13] += 0.14567312f * t;
- out[15] += 0.09403160f * t;
-
- ta = 0.28209479f * a[0] + 0.09011186f * a[6] - 0.15607835f * a[8];
- tb = 0.28209479f * b[0] + 0.09011186f * b[6] - 0.15607835f * b[8];
- out[5] += ta * b[5] + tb * a[5];
- t = a[5] * b[5];
- out[0] += 0.28209479f * t;
- out[6] += 0.09011186f * t;
- out[8] -= 0.15607835f * t;
-
- ta = 0.14867701f * a[14];
- tb = 0.14867701f * b[14];
- out[5] += ta * b[9] + tb * a[9];
- out[9] += ta * b[5] + tb * a[5];
- t = a[5] * b[9] + a[9] * b[5];
- out[14] += 0.14867701f * t;
-
- ta = 0.18467439f * a[3] + 0.11516472f * a[13] - 0.14867701f * a[15];
- tb = 0.18467439f * b[3] + 0.11516472f * b[13] - 0.14867701f * b[15];
- out[5] += ta * b[10] + tb * a[10];
- out[10] += ta * b[5] + tb * a[5];
- t = a[5] * b[10] + a[10] * b[5];
- out[3] += 0.18467439f * t;
- out[13] += 0.11516472f * t;
- out[15] -= 0.14867701f * t;
-
- ta = 0.23359668f * a[2] + 0.05947080f * a[12] - 0.11516472f * a[14];
- tb = 0.23359668f * b[2] + 0.05947080f * b[12] - 0.11516472f * b[14];
- out[5] += ta * b[11] + tb * a[11];
- out[11] += ta * b[5] + tb * a[5];
- t = a[5] * b[11] + a[11] * b[5];
- out[2] += 0.23359668f * t;
- out[12] += 0.05947080f * t;
- out[14] -= 0.11516472f * t;
-
- ta = 0.28209479f * a[0];
- tb = 0.28209479f * b[0];
- out[6] += ta * b[6] + tb * a[6];
- t = a[6] * b[6];
- out[0] += 0.28209479f * t;
+ c3_t1(0, 0.28209479f, out[0], 6, -0.12615663f, 0.0f, 8, -0.21850969f, 0.0f, 1, 0.0f)
+ c3_t2(3, 0.21850969f, 0.0f, 13, -0.05839917f, 0.0f, 15, -0.22617901f, 0.0f, 1, out[1], 4, 0.0f)
+ c3_t2(2, 0.21850969f, 0.0f, 12, -0.14304817f, 0.0f, 14, -0.18467439f, 0.0f, 1, out[1], 5, 0.0f)
+ c1_t2(11, 0.20230066f, 0.0f, 1, out[1], 6, out[6])
+ c2_t2(9, 0.22617901f, 0.0f, 11, 0.05839917f, out[11], 1, out[1], 8, out[8])
+ c2_t1(0, 0.28209480f, out[0], 6, 0.25231326f, out[6], 2, out[2])
+ c1_t2(12, 0.24776671f, out[12], 2, out[2], 6, out[6])
+ c3_t1(0, 0.28209480f, out[0], 6, -0.12615663f, out[6], 8, 0.21850969f, out[8], 3, out[3])
+ c1_t2(13, 0.20230066f, out[13], 3, out[3], 6, out[6])
+ c3_t2(2, 0.21850969f, out[2], 12, -0.14304817f, out[12], 14, 0.18467439f, out[14], 3, out[3], 7, 0.0f)
+ c2_t2(13, -0.05839917f, out[13], 15, 0.22617901f, out[15], 3, out[3], 8, out[8])
+ c2_t1(0, 0.28209479f, out[0], 6, -0.18022375f, out[6], 4, out[4])
+ c1_t2(7, 0.15607835f, out[7], 4, out[4], 5, out[5])
+ c2_t2(3, 0.22617901f, out[3], 13, -0.09403160f, out[13], 4, out[4], 9, out[9])
+ c2_t2(2, 0.18467439f, out[2], 12, -0.18806319f, out[12], 4, out[4], 10, 0.0f)
+ c3_t2(3, -0.05839917f, out[3], 13, 0.14567312f, out[13], 15, 0.09403160f, out[15], 4, out[4], 11, out[11])
+ c3_t1(0, 0.28209479f, out[0], 6, 0.09011186f, out[6], 8, -0.15607835f, out[8], 5, out[5])
+ c1_t2(14, 0.14867701f, out[14], 5, out[5], 9, out[9])
+ c3_t2(3, 0.18467439f, out[3], 13, 0.11516472f, out[13], 15, -0.14867701f, out[15], 5, out[5], 10, out[10])
+ c3_t2(2, 0.23359668f, out[2], 12, 0.05947080f, out[12], 14, -0.11516472f, out[14], 5, out[5], 11, out[11])
+ c1_t1(0, 0.28209479f, out[0], 6, out[6])
out[6] += 0.18022376f * t;
-
- ta = 0.09011186f * a[6] + 0.28209479f * a[0] + 0.15607835f * a[8];
- tb = 0.09011186f * b[6] + 0.28209479f * b[0] + 0.15607835f * b[8];
- out[7] += ta * b[7] + tb * a[7];
- t = a[7] * b[7];
- out[6] += 0.09011186f * t;
- out[0] += 0.28209479f * t;
- out[8] += 0.15607835f * t;
-
- ta = 0.14867701f * a[9] + 0.18467439f * a[1] + 0.11516472f * a[11];
- tb = 0.14867701f * b[9] + 0.18467439f * b[1] + 0.11516472f * b[11];
- out[7] += ta * b[10] + tb * a[10];
- out[10] += ta * b[7] + tb * a[7];
- t = a[7] * b[10] + a[10] * b[7];
- out[9] += 0.14867701f * t;
- out[1] += 0.18467439f * t;
- out[11] += 0.11516472f * t;
-
- ta = 0.05947080f * a[12] + 0.23359668f * a[2] + 0.11516472f * a[14];
- tb = 0.05947080f * b[12] + 0.23359668f * b[2] + 0.11516472f * b[14];
- out[7] += ta * b[13] + tb * a[13];
- out[13] += ta * b[7]+ tb * a[7];
- t = a[7] * b[13] + a[13] * b[7];
- out[12] += 0.05947080f * t;
- out[2] += 0.23359668f * t;
- out[14] += 0.11516472f * t;
-
- ta = 0.14867701f * a[15];
- tb = 0.14867701f * b[15];
- out[7] += ta * b[14] + tb * a[14];
- out[14] += ta * b[7] + tb * a[7];
- t = a[7] * b[14] + a[14] * b[7];
- out[15] += 0.14867701f * t;
-
- ta = 0.28209479f * a[0] - 0.18022375f * a[6];
- tb = 0.28209479f * b[0] - 0.18022375f * b[6];
- out[8] += ta * b[8] + tb * a[8];
- t = a[8] * b[8];
- out[0] += 0.28209479f * t;
- out[6] -= 0.18022375f * t;
-
- ta = -0.09403160f * a[11];
- tb = -0.09403160f * b[11];
- out[8] += ta * b[9] + tb * a[9];
- out[9] += ta * b[8] + tb * a[8];
- t = a[8] * b[9] + a[9] * b[8];
- out[11] -= 0.09403160f * t;
-
- ta = -0.09403160f * a[15];
- tb = -0.09403160f * b[15];
- out[8] += ta * b[13] + tb * a[13];
- out[13] += ta * b[8] + tb * a[8];
- t = a[8] * b[13] + a[13] * b[8];
- out[15] -= 0.09403160f * t;
-
- ta = 0.18467439f * a[2] - 0.18806319f * a[12];
- tb = 0.18467439f * b[2] - 0.18806319f * b[12];
- out[8] += ta * b[14] + tb * a[14];
- out[14] += ta * b[8] + tb * a[8];
- t = a[8] * b[14] + a[14] * b[8];
- out[2] += 0.18467439f * t;
- out[12] -= 0.18806319f * t;
-
- ta = -0.21026104f * a[6] + 0.28209479f * a[0];
- tb = -0.21026104f * b[6] + 0.28209479f * b[0];
- out[9] += ta * b[9] + tb * a[9];
- t = a[9] * b[9];
- out[6] -= 0.21026104f * t;
- out[0] += 0.28209479f * t;
-
- ta = 0.28209479f * a[0];
- tb = 0.28209479f * b[0];
- out[10] += ta * b[10] + tb * a[10];
- t = a[10] * b[10];
- out[0] += 0.28209479f * t;
-
- ta = 0.28209479f * a[0] + 0.12615663f * a[6] - 0.14567312f * a[8];
- tb = 0.28209479f * b[0] + 0.12615663f * b[6] - 0.14567312f * b[8];
- out[11] += ta * b[11] + tb * a[11];
- t = a[11] * b[11];
- out[0] += 0.28209479f * t;
- out[6] += 0.12615663f * t;
- out[8] -= 0.14567312f * t;
-
- ta = 0.28209479f * a[0] + 0.16820885f * a[6];
- tb = 0.28209479f * b[0] + 0.16820885f * b[6];
- out[12] += ta * b[12] + tb * a[12];
- t = a[12] * b[12];
- out[0] += 0.28209479f * t;
- out[6] += 0.16820885f * t;
-
- ta =0.28209479f * a[0] + 0.14567312f * a[8] + 0.12615663f * a[6];
- tb =0.28209479f * b[0] + 0.14567312f * b[8] + 0.12615663f * b[6];
- out[13] += ta * b[13] + tb * a[13];
- t = a[13] * b[13];
- out[0] += 0.28209479f * t;
- out[8] += 0.14567312f * t;
- out[6] += 0.12615663f * t;
-
- ta = 0.28209479f * a[0];
- tb = 0.28209479f * b[0];
- out[14] += ta * b[14] + tb * a[14];
- t = a[14] * b[14];
- out[0] += 0.28209479f * t;
-
- ta = 0.28209479f * a[0] - 0.21026104f * a[6];
- tb = 0.28209479f * b[0] - 0.21026104f * b[6];
- out[15] += ta * b[15] + tb * a[15];
- t = a[15] * b[15];
- out[0] += 0.28209479f * t;
- out[6] -= 0.21026104f * t;
+ c3_t1(0, 0.28209479f, out[0], 6, 0.09011186f, out[6], 8, 0.15607835f, out[8], 7, out[7])
+ c3_t2(9, 0.14867701f, out[9], 1, 0.18467439f, out[1], 11, 0.11516472f, out[11], 7, out[7], 10, out[10])
+ c3_t2(12, 0.05947080f, out[12], 2, 0.23359668f, out[2], 14, 0.11516472f, out[14], 7, out[7], 13, out[13])
+ c1_t2(15, 0.14867701f, out[15], 7, out[7], 14, out[14])
+ c2_t1(0, 0.28209479f, out[0], 6, -0.18022375f, out[6], 8, out[8])
+ c1_t2(11, -0.09403160f, out[11], 8, out[8], 9, out[9])
+ c1_t2(15, -0.09403160f, out[15], 8, out[8], 13, out[13])
+ c2_t2(2, 0.18467439f, out[2], 12, -0.18806319f, out[12], 8, out[8], 14, out[14])
+ c2_t1(0, 0.28209479f, out[0], 6, -0.21026104f, out[6], 9, out[9])
+ c1_t1(0, 0.28209479f, out[0], 10, out[10])
+ c3_t1(0, 0.28209479f, out[0], 6, 0.12615663f, out[6], 8, -0.14567312f, out[8], 11, out[11])
+ c2_t1(0, 0.28209479f, out[0], 6, 0.16820885f, out[6], 12, out[12])
+ c3_t1(0, 0.28209479f, out[0], 8, 0.14567312f, out[8], 6, 0.12615663f, out[6], 13, out[13])
+ c1_t1(0, 0.28209479f, out[0], 14, out[14])
+ c2_t1(0, 0.28209479f, out[0], 6, -0.21026104f, out[6], 15, out[15])
return out;
}