00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "lux.h"
00025
00026 #ifndef LUX_USE_SSE
00027 #include "matrix4x4.h"
00028 #include "error.h"
00029
00030 namespace lux
00031 {
00032
00033 Matrix4x4::Matrix4x4(float mat[4][4])
00034 {
00035 memcpy(m, mat, 16 * sizeof(float));
00036 }
00037
00038 Matrix4x4::Matrix4x4(float t00, float t01, float t02, float t03,
00039 float t10, float t11, float t12, float t13,
00040 float t20, float t21, float t22, float t23,
00041 float t30, float t31, float t32, float t33)
00042 {
00043 m[0][0] = t00; m[0][1] = t01; m[0][2] = t02; m[0][3] = t03;
00044 m[1][0] = t10; m[1][1] = t11; m[1][2] = t12; m[1][3] = t13;
00045 m[2][0] = t20; m[2][1] = t21; m[2][2] = t22; m[2][3] = t23;
00046 m[3][0] = t30; m[3][1] = t31; m[3][2] = t32; m[3][3] = t33;
00047 }
00048
00049 boost::shared_ptr<Matrix4x4> Matrix4x4::Transpose() const
00050 {
00051 boost::shared_ptr<Matrix4x4> o(new Matrix4x4(m[0][0], m[1][0], m[2][0], m[3][0],
00052 m[0][1], m[1][1], m[2][1], m[3][1],
00053 m[0][2], m[1][2], m[2][2], m[3][2],
00054 m[0][3], m[1][3], m[2][3], m[3][3]));
00055 return o;
00056 }
00057
00058 boost::shared_ptr<Matrix4x4> Matrix4x4::Inverse() const
00059 {
00060 int indxc[4], indxr[4];
00061 int ipiv[4] = { 0, 0, 0, 0 };
00062 float minv[4][4];
00063 memcpy(minv, m, 4 * 4 * sizeof(float));
00064 for (int i = 0; i < 4; ++i) {
00065 int irow = -1, icol = -1;
00066 float big = 0.;
00067
00068 for (int j = 0; j < 4; ++j) {
00069 if (ipiv[j] != 1) {
00070 for (int k = 0; k < 4; ++k) {
00071 if (ipiv[k] == 0) {
00072 if (fabsf(minv[j][k]) >= big) {
00073 big = fabsf(minv[j][k]);
00074 irow = j;
00075 icol = k;
00076 }
00077 } else if (ipiv[k] > 1)
00078 luxError(LUX_MATH, LUX_ERROR, "Singular matrix in MatrixInvert");
00079 }
00080 }
00081 }
00082 ++ipiv[icol];
00083
00084 if (irow != icol) {
00085 for (int k = 0; k < 4; ++k)
00086 swap(minv[irow][k], minv[icol][k]);
00087 }
00088 indxr[i] = irow;
00089 indxc[i] = icol;
00090 if (minv[icol][icol] == 0.)
00091 luxError(LUX_MATH, LUX_ERROR, "Singular matrix in MatrixInvert");
00092
00093 float pivinv = 1.f / minv[icol][icol];
00094 minv[icol][icol] = 1.f;
00095 for (int j = 0; j < 4; ++j)
00096 minv[icol][j] *= pivinv;
00097
00098 for (int j = 0; j < 4; ++j) {
00099 if (j != icol) {
00100 float save = minv[j][icol];
00101 minv[j][icol] = 0;
00102 for (int k = 0; k < 4; ++k)
00103 minv[j][k] -= minv[icol][k] * save;
00104 }
00105 }
00106 }
00107
00108 for (int j = 3; j >= 0; --j) {
00109 if (indxr[j] != indxc[j]) {
00110 for (int k = 0; k < 4; ++k)
00111 swap(minv[k][indxr[j]], minv[k][indxc[j]]);
00112 }
00113 }
00114
00115 boost::shared_ptr<Matrix4x4> o(new Matrix4x4(minv));
00116 return o;
00117 }
00118
00119 }
00120
00121 #else // LUX_USE_SSE
00122
00123 #include "matrix4x4-sse.h"
00124
00125 typedef long int32;
00126 #ifdef __GNUC__
00127 const float __attribute__ ((aligned(16))) _F1001[4] = { 1.0f, 0.0f, 0.0f, 1.0f };
00128 const int32 __attribute__ ((aligned(16))) _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
00129 #else //intel compiler
00130 const _MM_ALIGN16 float _F1001[4] = { 1.0f, 0.0f, 0.0f, 1.0f };
00131 const _MM_ALIGN16 __int32 _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
00132 #endif
00133 #define Sign_PNNP (*(__m128*)&_Sign_PNNP) // + - - +
00134
00135 namespace lux
00136 {
00137
00138 Matrix4x4::Matrix4x4(float mat[4][4])
00139 {
00140 _L1 = _mm_set_ps(mat[3][0], mat[2][0], mat[1][0], mat[0][0]);
00141 _L2 = _mm_set_ps(mat[3][1], mat[2][1], mat[1][1], mat[0][1]);
00142 _L3 = _mm_set_ps(mat[3][2], mat[2][2], mat[1][2], mat[0][2]);
00143 _L4 = _mm_set_ps(mat[3][3], mat[2][3], mat[1][3], mat[0][3]);
00144 }
00145
00146 Matrix4x4::Matrix4x4(float f11, float f12, float f13, float f14,
00147 float f21, float f22, float f23, float f24,
00148 float f31, float f32, float f33, float f34,
00149 float f41, float f42, float f43, float f44)
00150 {
00151 _L1 = _mm_set_ps(f41, f31, f21, f11);
00152 _L2 = _mm_set_ps(f42, f32, f22, f12);
00153 _L3 = _mm_set_ps(f43, f33, f23, f13);
00154 _L4 = _mm_set_ps(f44, f34, f24, f14);
00155 }
00156
00157 boost::shared_ptr<Matrix4x4> Matrix4x4::Transpose() const
00158 {
00159 __m128 xmm0 = _mm_unpacklo_ps(_L1, _L2),
00160 xmm1 = _mm_unpacklo_ps(_L3, _L4),
00161 xmm2 = _mm_unpackhi_ps(_L1, _L2),
00162 xmm3 = _mm_unpackhi_ps(_L3, _L4);
00163
00164 __m128 L1, L2, L3, L4;
00165 L1 = _mm_movelh_ps(xmm0, xmm1);
00166 L2 = _mm_movehl_ps(xmm1, xmm0);
00167 L3 = _mm_movelh_ps(xmm2, xmm3);
00168 L4 = _mm_movehl_ps(xmm3, xmm2);
00169
00170 boost::shared_ptr<Matrix4x4> o(new Matrix4x4(L1, L2, L3, L4));
00171 return o;
00172 }
00173
00174 boost::shared_ptr<Matrix4x4> Matrix4x4::Inverse() const
00175 {
00176 __m128 A = _mm_movelh_ps(_L1, _L2),
00177 B = _mm_movehl_ps(_L2, _L1),
00178 C = _mm_movelh_ps(_L3, _L4),
00179 D = _mm_movehl_ps(_L4, _L3);
00180 __m128 iA, iB, iC, iD,
00181 DC, AB;
00182 __m128 dA, dB, dC, dD;
00183 __m128 det, d, d1, d2;
00184 __m128 rd;
00185
00186
00187 AB = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x0F), B);
00188 AB = _mm_sub_ps(AB, _mm_mul_ps(_mm_shuffle_ps(A, A, 0xA5),
00189 _mm_shuffle_ps(B, B, 0x4E)));
00190
00191 DC = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x0F), C);
00192 DC = _mm_sub_ps(DC, _mm_mul_ps(_mm_shuffle_ps(D, D, 0xA5),
00193 _mm_shuffle_ps(C, C, 0x4E)));
00194
00195
00196 dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F), A);
00197 dA = _mm_sub_ss(dA, _mm_movehl_ps(dA, dA));
00198
00199 dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F), B);
00200 dB = _mm_sub_ss(dB, _mm_movehl_ps(dB, dB));
00201
00202
00203 dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F), C);
00204 dC = _mm_sub_ss(dC, _mm_movehl_ps(dC, dC));
00205
00206 dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F), D);
00207 dD = _mm_sub_ss(dD, _mm_movehl_ps(dD, dD));
00208
00209
00210 d = _mm_mul_ps(_mm_shuffle_ps(DC, DC, 0xD8), AB);
00211
00212
00213 iD = _mm_mul_ps(_mm_shuffle_ps(C, C, 0xA0), _mm_movelh_ps(AB, AB));
00214 iD = _mm_add_ps(iD, _mm_mul_ps(_mm_shuffle_ps(C, C, 0xF5),
00215 _mm_movehl_ps(AB, AB)));
00216
00217 iA = _mm_mul_ps(_mm_shuffle_ps(B, B, 0xA0), _mm_movelh_ps(DC, DC));
00218 iA = _mm_add_ps(iA, _mm_mul_ps(_mm_shuffle_ps(B, B, 0xF5),
00219 _mm_movehl_ps(DC, DC)));
00220
00221
00222 d = _mm_add_ps(d, _mm_movehl_ps(d, d));
00223 d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
00224 d1 = _mm_mul_ps(dA, dD);
00225 d2 = _mm_mul_ps(dB, dC);
00226
00227
00228 iD = _mm_sub_ps(_mm_mul_ps(D, _mm_shuffle_ps(dA, dA, 0)), iD);
00229
00230
00231 iA = _mm_sub_ps(_mm_mul_ps(A, _mm_shuffle_ps(dD, dD, 0)), iA);
00232
00233
00234 det = _mm_sub_ps(_mm_add_ps(d1, d2), d);
00235 rd = (__m128)(_mm_div_ps(_mm_set_ss(1.0f), det));
00236 #ifdef ZERO_SINGULAR
00237 rd = _mm_and_ps(_mm_cmpneq_ss(det, _mm_setzero_ps()), rd);
00238 #endif
00239
00240
00241 iB = _mm_mul_ps(D, _mm_shuffle_ps(AB, AB, 0x33));
00242 iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D, D, 0xB1),
00243 _mm_shuffle_ps(AB, AB, 0x66)));
00244
00245 iC = _mm_mul_ps(A, _mm_shuffle_ps(DC, DC, 0x33));
00246 iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A, A, 0xB1),
00247 _mm_shuffle_ps(DC, DC, 0x66)));
00248
00249 rd = _mm_shuffle_ps(rd, rd, 0);
00250 rd = _mm_xor_ps(rd, Sign_PNNP);
00251
00252
00253 iB = _mm_sub_ps(_mm_mul_ps(C, _mm_shuffle_ps(dB, dB, 0)), iB);
00254
00255
00256 iC = _mm_sub_ps(_mm_mul_ps(B, _mm_shuffle_ps(dC, dC, 0)) ,iC);
00257
00258
00259 iA =_mm_mul_ps(iA, rd);
00260 iB =_mm_mul_ps(iB, rd);
00261 iC =_mm_mul_ps(iC, rd);
00262 iD =_mm_mul_ps(iD, rd);
00263
00264
00265
00266 boost::shared_ptr<Matrix4x4>
00267 o(new Matrix4x4(_mm_shuffle_ps(iA, iB, 0x77),
00268 _mm_shuffle_ps(iA, iB, 0x22),
00269 _mm_shuffle_ps(iC, iD, 0x77),
00270 _mm_shuffle_ps(iC, iD, 0x22)));
00271 return o;
00272 }
00273
00274 }
00275
00276 #endif // LUX_USE_SSE
00277
00278
00279