/* baijsolvnat4.c */
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
  Special case where the matrix was ILU(0) factored in the natural
  ordering. This eliminates the need for the column and row permutation.
*/
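
/*
  Illustrative usage sketch (not part of this file, and an assumption about
  typical usage rather than a test from this source): these kernels are
  reached when a MATSEQBAIJ matrix with block size 4 is ILU(0)-factored in
  the natural ordering, e.g.

     Mat A;  Vec b,x;  KSP ksp;  PC pc;
     MatCreateSeqBAIJ(PETSC_COMM_SELF,4,m,m,nz,NULL,&A);
     ... assemble A, b ...
     KSPCreate(PETSC_COMM_SELF,&ksp);
     KSPSetOperators(ksp,A,A);
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatOrderingType(pc,MATORDERINGNATURAL);
     KSPSolve(ksp,b,x);   <-- MatSolve() then dispatches to a kernel below

  The m, nz, and assembly steps above are placeholders.
*/
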
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
  PetscInt          n     = a->mbs;
  const PetscInt    *ai   = a->i,*aj = a->j;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa   = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));
#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
#else
  {
    PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
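    /*
       Block forward substitution: for each block row i compute
           x_i = b_i - sum_{j<i} L_ij * x_j
       where each L_ij is a 4x4 block stored column-major (column k of the
       block is v[4k..4k+3]); L has a unit block diagonal, so no diagonal
       solve is needed in this sweep.
    */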
    idx  = 0;
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
    for (i=1; i<n; i++) {
      v    = aa + 16*ai[i];
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;
      s1   = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      x[idx]   = s1;
      x[1+idx] = s2;
      x[2+idx] = s3;
      x[3+idx] = s4;
    }

    /* backward solve the upper triangular */
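    /*
       Block backward substitution: for each block row i, from last to first,
           x_i = D_i^{-1} * (x_i - sum_{j>i} U_ij * x_j)
       The diagonal blocks D_i were inverted during factorization (see
       blockinvert.h), so applying D_i^{-1} is just another 4x4 block multiply.
    */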
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = x[idt];   s2 = x[1+idt];
      s3   = x[2+idt]; s4 = x[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      v        = aa + ai16;
      x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
      x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
      x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
      x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
      idt     -= 4;
    }
  }
#endif
  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n  = a->mbs,*vi,*ai = a->i,*aj = a->j,*adiag = a->diag;
  PetscInt          i,k,nz,idx,jdx,idt;
  const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa = a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

  /* forward solve the lower triangular */
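  /*
     Same block forward substitution as in the _inplace kernel above, except
     that in this factored storage the segment ai[i] .. ai[i+1]-1 holds only
     the L blocks of block row i, so the whole segment is consumed here.
  */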
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    for (k=0; k<nz; k++) {
      jdx = bs*vi[k];
      x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += bs2;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
  }

  /* backward solve the upper triangular */
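  /*
     Here the U factor is stored in reverse order: adiag[i] indexes the
     inverted 4x4 diagonal block of row i, and the off-diagonal U blocks of
     row i occupy positions adiag[i+1]+1 .. adiag[i]-1, so after the k-loop
     below v points exactly at the inverted diagonal block.
  */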
  for (i=n-1; i>=0; i--) {
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1] + 1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = bs*i;
    s1  = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];
    for (k=0; k<nz; k++) {
      idx = bs*vi[k];
      x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
      v  += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
  }
  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n));
  PetscFunctionReturn(0);
}
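
/*
  Mixed-precision variant: the right-hand side is demoted from PetscScalar to
  MatScalar (e.g. double to float in a mixed-precision build), both triangular
  solves run in MatScalar precision in place inside the x array (t aliases x),
  and the backward sweep promotes the result back to PetscScalar row by row
  from the bottom up.
*/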
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n  = a->mbs,*ai = a->i,*aj = a->j,*diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

  {
    MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    MatScalar       *t = (MatScalar*)x;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      v    = aa + 16*ai[i];
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;
      s1   = (MatScalar)b[idx];
      s2   = (MatScalar)b[1+idx];
      s3   = (MatScalar)b[2+idx];
      s4   = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }

    /* backward solve the upper triangular; rows below the current one have
       already been promoted back to PetscScalar in x, so they are read
       through x while the current row is still read through t */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      v        = aa + ai16;
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }
  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
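
/*
  The _usj variant reads column indices through an unsigned short pointer
  (note the cast of a->j below), presumably so the factorization can store
  them in 16 bits and halve the index bandwidth; this restricts the kernel
  to fewer than 65536 block columns.
*/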
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
  unsigned short *aj = (unsigned short*)a->j;
  int            *ai = a->i,n = a->mbs,*diag = a->diag;
  MatScalar      *aa = a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArray(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v,*t = (MatScalar*)x;
    int            nz,i,idt,ai16;
    unsigned int   jdx,idx;
    unsigned short *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*((unsigned int)ai[1]);

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
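        /*
           Scalar equivalent of the SSE sequence below (reading aid only):
               t[idx+r] -= v[r]*t[jdx] + v[4+r]*t[jdx+1]
                         + v[8+r]*t[jdx+2] + v[12+r]*t[jdx+3],   r = 0..3
           Each element of XMM6 = t[jdx..jdx+3] is broadcast with SHUFFLE,
           multiplied by one column of the 4x4 block, and subtracted from
           the accumulator XMM7.
        */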
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v = aa + ai16;
      if (--i >= 0) ai16 = 16*diag[i]; /* guard the decrement so diag[-1] is never read on the last row */
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
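      /*
         Scalar equivalent (cf. the plain C kernels above):
             t[idt+r] = v[r]*s1 + v[4+r]*s2 + v[8+r]*s3 + v[12+r]*s4,  r = 0..3
         where s1..s4 are held in XMM7 and v points at the inverted diagonal
         block of the current row.
      */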
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }
  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ*)A->data;
  int         *aj = a->j;
  int         *ai = a->i,n = a->mbs,*diag = a->diag;
  MatScalar   *aa = a->a;
  PetscScalar *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArray(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v,*t = (MatScalar*)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);
        /* jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);
        /* idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v = aa + ai16;
      if (--i >= 0) ai16 = 16*diag[i]; /* guard the decrement so diag[-1] is never read on the last row */
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }
  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif